diff --git a/.gitattributes b/.gitattributes index 6d931bd8e4987ea0420efca09b64441bb0dca5e6..861ddeb0cceefa3ae701cc80498d43b228a22e2e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -55,3 +55,6 @@ sft_devstral_24B/wandb/offline-run-20251223_134236-sicxj35d/run-sicxj35d.wandb f sft_devstral_24B/wandb/offline-run-20251223_134432-y3kepwdy/run-y3kepwdy.wandb filter=lfs diff=lfs merge=lfs -text sft_devstral_24B/wandb/run-20251223_134618-9jed4peb/run-9jed4peb.wandb filter=lfs diff=lfs merge=lfs -text sft_devstral_24B/wandb/run-20251223_142235-cktxoubm/run-cktxoubm.wandb filter=lfs diff=lfs merge=lfs -text +sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/run-i1cmzyri.wandb filter=lfs diff=lfs merge=lfs -text +sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/run-oordmylf.wandb filter=lfs diff=lfs merge=lfs -text +sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/run-ny9q48hd.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/sft_devstral_24B_v2/best_adapter/README.md b/sft_devstral_24B_v2/best_adapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/best_adapter/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/best_adapter/adapter_config.json b/sft_devstral_24B_v2/best_adapter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/best_adapter/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/best_adapter/adapter_model.safetensors b/sft_devstral_24B_v2/best_adapter/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..96a07c2b8637c3e7c69037eb75ea51b298a483f4 --- /dev/null +++ b/sft_devstral_24B_v2/best_adapter/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83291f6d455ca5ce17a07f21cc02ec56cba0671e1d6495dd81c1d98fd10b7c26 +size 45690960 diff --git a/sft_devstral_24B_v2/best_adapter/training_args.bin b/sft_devstral_24B_v2/best_adapter/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/best_adapter/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64b375f9c82882815cbceadcad8e4ea93bd5b8f0 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44afa7c257cedf456ae8538816f1dcd1546f8e0509f0b58cb9c689ac56711166 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7312e8efbc83a8081481e68d1655040c9c7c43d5 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2baa24039deac4dc92f3e0e35321ca7d94cd710dfd3416641e7f5735dcfbaba5 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b803061af35b98169fc13d8c3c28dd03eb0baf1c --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0de6a7ba694bc1275a5b47fcf2c7b687280c780b6f307863a07ec69d3e9567f +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1f8e31adb2c91cfa161c1c05ffcfb3b1fb6f9a1 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd7d0063a4b9f937b9f5f63ec0f89ae9cc467878dc393ecd578a9607b37e26c8 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7be2e67c7dfd3346ebdde2239ad99f2e984c29bb --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/trainer_state.json @@ -0,0 +1,3623 @@ +{ + "best_global_step": 1000, + "best_metric": 0.8388314247131348, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-1000", + "epoch": 0.4219409282700422, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0390224045010268e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b41c295952b7b2143fb298eb02acd55dc77be69 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:831a7d926d7f5df3964543a446004381a22b97659f1e9acb7da164b4155aa7db +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dd014b628a5e75df238cfef4dfede864947a853 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633ffc9a088e0005164ac58cc14f8eea176fdd2e4b9ebdd504fdddd3accc0a3d +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f82febed279ef2903d9ad148decb669afc89d39 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1ae23f994928215671a68d719c37e5eb1f321bb43710f6a94f783d81024ee9 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fbe62592600db02c67bc66e7661929ff9c26d8d --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e0ab851b74630ec51d2dd1e28429156fa119043bf6e4acb868afec1778a8d36 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c19f44257fd7346ab32b8a5ab880b8274d328e40 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/trainer_state.json @@ -0,0 +1,5413 @@ +{ + "best_global_step": 1500, + "best_metric": 0.7896141409873962, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-1500", + "epoch": 0.6329113924050633, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5578111292507464e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-1500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2e15663269a8be15aab7d8f762afb1f5c473aa45 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddae9fb556b6dafd5e51ffbdf7776618b5a2438b53191f850d3c060e448f3161 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac870985f1e95734ff733701c60dab70ecc4b477 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78804d00938d938b90a599b8b0d8dcd8f09e8116b29fd8a7b96100a5696f346 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d1fa81bef0edfc58dbd9391656856af5e01758e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05d71d3ab99767003aa382d142674f4d194421936f48f312a57e6f262a24b51 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38992fdacf1199057082f742f440d3f2251dab08 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4e41c084848d0f3091226abc7db74bacb76a784828650f35ffabc473a2c375 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bca7ed7716ab320356a729219683ed143f0cdfd8 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/trainer_state.json @@ -0,0 +1,7203 @@ +{ + "best_global_step": 2000, + "best_metric": 0.7587011456489563, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-2000", + "epoch": 0.8438818565400844, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0761430952197837e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..121e3e0cc1bf9349e9ac37c5bc3734cd67a99fee --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12ceefd222d6928d2451717456e57f756d9bff479938fd49324891727c39637e +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ada46c62981743961af0baeaf8244fe7094ae45 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938980f0edbd2046c344e1b26fe96147ee0bfebc7f3495443c62b27002ce70c4 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcfacb95775b1f00419c1d975ed84e86e45be12a --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39b0d665c25c7b02a656e5cf197220cfb689931bbb6d4ae22b61d327d830a916 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff8cc7b65cfabdfe24ba57b843faf5a720469695 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4849064f0d6ca593dabdc277f5c88ce471853e22cd7c8abff162abca0907f0fc +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2839cfd7bba924eab57485396ec61199a6485f83 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/trainer_state.json @@ -0,0 +1,8993 @@ +{ + "best_global_step": 2500, + "best_metric": 0.741338849067688, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-2500", + "epoch": 1.0548523206751055, + "eval_steps": 100, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.596869542036017e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-2500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8bdcf2b00b6ed5c569ae98da73b11ae81d0c49c --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572c1222d21c879d07f6653cb4605da12a85133cc0188c2b00b2221aaa62cfbe +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6aae9777e19bbefae04e231515af5c868fee8dc0 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13c8293c1238c70e3998e56118b6e7dd7fa6bdfcb679007496a08a03d0b3023 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..899db96ae3a4e8341afa93282303dc62f79da3f7 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60683ad220666ae109ec67d53325ffb9c9c5fcc7a868f2ef15b68e9723037766 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a2169e6556e5c7c09e61cf4e205c3f258889f5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:387a3533fe4e8e1eeba51f1150cceef09f54bd06750ea51420287466c7ba0384 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9334919de3bbf6f5cd1d1da6ccf983bae914f5a9 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/trainer_state.json @@ -0,0 +1,10783 @@ +{ + "best_global_step": 3000, + "best_metric": 0.72515869140625, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-3000", + "epoch": 1.2658227848101267, + "eval_steps": 100, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.1176815960961147e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c15a11a7f2dc1f5b3e7ce1c8e6ad2c95653b2726 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957eb2d5d1ffe73b7a84d0fb076e12576f526034973b97039ee4c03976bffe01 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..45e37800fcf9af05a6afcfe22fbe83c465281820 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d1ba7e825bf8869398fcbd6f238d668c2d90eab33e298c3ae735600ed11395f +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..35e5b0ea58b7929219be7f4382a45d9988983893 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f916096e49dbb7a03cf13b3f0a9c0dd359b939f3fc1e94d65073bd53e57e12 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b2e6a4bddc4ac4f5bfb4f008738d70b4f5f9c5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ee2a286f9179f64de03a71ced091d09e4c4512b55ec74518ace31194ac91b1a +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c326d1fb0f1bf1db5ffe9b6c8b1dcee1448259a6 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/trainer_state.json @@ -0,0 +1,12573 @@ +{ + "best_global_step": 3500, + "best_metric": 0.7155047059059143, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-3500", + "epoch": 1.4767932489451476, + "eval_steps": 100, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.6346648626716406e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-3500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23869e32e3983b5bc9964f3382d0e57db9cc4d73 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:173a803579223c82285604e1b52adbd56d0c7bce3dacfd03542a70e4c687754d +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d51952da9ffbc07833d143822a4a14d19c8c779 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecf55d2c399bc1b0d845729a6b157e58e30c2ba7e64c09a13f9f815790a57656 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1bb7b00b7753923e97897743dabd3e1c12bd9ccf --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3471bbfccf29229fd8d758df0990b2b6bed4da552506dccf18432cdaf667e8 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e159360da705618af0016ddb18545746f2ada3b8 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e026bb3fd7728930bfedd3c5adee45849b3409c0ff3f1fd92edac6187cbc0f6b +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aab536d595d3a9c31fa828cf89ce3b0e346a3109 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/trainer_state.json @@ -0,0 +1,14363 @@ +{ + "best_global_step": 4000, + "best_metric": 0.7027890682220459, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-4000", + "epoch": 1.6877637130801688, + "eval_steps": 100, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.153598321108613e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec120351a706768bea687fa1a543c46c80a7521f --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fd1ebf87b90eb17afa5390dea09e0139ead244bc4d3cd999cc2e2c4c3d750af +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f7e776838811ea67f089294dc8f0879741747f0 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b58168fa8b05ffa2b0c22529be4bcf5aba1b2b9cb6b14d36cda9f8f43bbf916 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2628aec7d9410e5bdb3c50afda87aa12a87f9878 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0efe65d231115c25223bf7b93f16e661ce129b91718b68f1f079e626bed512b +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9a16ab30068f50716feedc77eb3792954016f5 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ddebc5e42121a3c52427c71de63ee27a7547ec14262f7ddfeb0be5491a11af0 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f1ad5227ab431506ae6a1ce2483064aff5f735e8 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/trainer_state.json @@ -0,0 +1,16153 @@ +{ + "best_global_step": 4500, + "best_metric": 0.6938078999519348, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-4500", + "epoch": 1.8987341772151898, + "eval_steps": 100, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.673687407282723e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-4500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..781b7af01cfbc68f2cd9d463b3c9a8387603b76d --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea7e298c73c76f6e746aed304202e1a80d01a42aac34a2cefcec6b80b6a4c732 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..895585e9850130b91f0cf78695d9acd2facc6ec0 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bced920d3c43b3981591a6f21ad10f1770c72f45942c1863276821c994b830f +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..be71f67f4c32cb8143bc10f8ce883751332ecd81 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9711074652a371d99e5fe3d707395a6c541027f3e8cff8001f39c786e3d147fc +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8639face960f82fc6de576c82d2288214c7b93f --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5faf250cd0db0115502e95cbf9731644d0155eca0681b6e62dec84919b6b40 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..618b883f93cd5fa392f021bc4b7da9606d2daa06 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/trainer_state.json @@ -0,0 +1,1833 @@ +{ + "best_global_step": 500, + "best_metric": 0.9080732464790344, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-500", + "epoch": 0.2109704641350211, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.1928835720736154e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c363358b11c8ba414ac45159929a3f83aeb3ceef --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d083c05758bdb44f6869a6f9404b83f60bfe5d08ce950b0a00c2f84bcf336779 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bba86984c8d147a4184250b2b201cbb364dd92b7 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2abffb856fadeb4daeb389e5c74b24a670d7e9e4b48f43e4a1f83f9711f24c68 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..21353ade3baddc6d391db67833f8f06266c9dadd --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad332796286baa38c331a99d0074fd0c8b9104a81282aa8631e4dc68cd06198f +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..65d6268e8b1bff1bbc91ef68cde8fa804c65d561 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e95d569981f05cebc7ab9314dbe8ca62f6f4a4bc7524af3959929589cf94ac8 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1320ef662d3a4d2c0bc6acab9561088eae275b18 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/trainer_state.json @@ -0,0 +1,17943 @@ +{ + "best_global_step": 5000, + "best_metric": 0.6908889412879944, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-5000", + "epoch": 2.109704641350211, + "eval_steps": 100, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + }, + { + "epoch": 1.89957805907173, + "grad_norm": 0.9778118133544922, + "learning_rate": 8.456131445302538e-05, + "loss": 0.5973100662231445, + "step": 4502 + }, + { + "epoch": 1.90042194092827, + "grad_norm": 0.9587083458900452, + "learning_rate": 8.454395658110536e-05, + "loss": 0.5982911586761475, + "step": 4504 + }, + { + "epoch": 1.90126582278481, + "grad_norm": 1.327643871307373, + "learning_rate": 8.452659074055798e-05, + "loss": 0.6858586668968201, + "step": 4506 + }, + { + "epoch": 1.9021097046413502, + "grad_norm": 1.0740257501602173, + "learning_rate": 8.450921693538922e-05, + "loss": 0.6172328591346741, + "step": 4508 + }, + { + "epoch": 1.9029535864978904, + "grad_norm": 1.0705101490020752, + "learning_rate": 8.449183516960685e-05, + "loss": 0.5349634289741516, + "step": 4510 + }, + { + "epoch": 1.9037974683544303, + "grad_norm": 0.9151237607002258, + "learning_rate": 8.447444544722058e-05, + "loss": 0.5769277811050415, + "step": 4512 + }, + { + "epoch": 1.9046413502109705, + "grad_norm": 1.139900803565979, + "learning_rate": 8.44570477722418e-05, + "loss": 0.6579093933105469, + "step": 4514 + }, + { + "epoch": 1.9054852320675106, + "grad_norm": 1.2481658458709717, + "learning_rate": 8.443964214868387e-05, + "loss": 0.6748929619789124, + "step": 4516 + }, + { + "epoch": 1.9063291139240506, + "grad_norm": 1.1661686897277832, + "learning_rate": 8.442222858056193e-05, + "loss": 0.6492021083831787, + "step": 4518 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 1.241477370262146, + "learning_rate": 8.440480707189295e-05, + "loss": 0.635409951210022, + "step": 4520 + }, + { + "epoch": 1.908016877637131, + "grad_norm": 1.1102054119110107, + "learning_rate": 8.438737762669573e-05, + "loss": 0.631928026676178, + "step": 4522 + }, + { + "epoch": 1.9088607594936708, + "grad_norm": 1.0638107061386108, + "learning_rate": 8.43699402489909e-05, + "loss": 0.604518473148346, + "step": 4524 + }, + { + "epoch": 1.909704641350211, + "grad_norm": 1.0270655155181885, + "learning_rate": 8.435249494280096e-05, + "loss": 0.61314457654953, + "step": 4526 + }, + { + "epoch": 1.9105485232067512, + "grad_norm": 1.1840111017227173, + "learning_rate": 8.433504171215018e-05, + "loss": 0.661663293838501, + "step": 4528 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 1.1404399871826172, + "learning_rate": 8.43175805610647e-05, + "loss": 0.7026967406272888, + "step": 4530 + }, + { + "epoch": 1.9122362869198313, + "grad_norm": 1.2371265888214111, + "learning_rate": 8.430011149357246e-05, + "loss": 0.6599440574645996, + "step": 4532 + }, + { + "epoch": 1.9130801687763714, + "grad_norm": 1.0042651891708374, + "learning_rate": 8.428263451370326e-05, + "loss": 0.5728344321250916, + "step": 4534 + }, + { + "epoch": 1.9139240506329114, + "grad_norm": 1.04367196559906, + "learning_rate": 8.426514962548866e-05, + "loss": 0.6495450735092163, + "step": 4536 + }, + { + "epoch": 1.9147679324894513, + "grad_norm": 1.0867135524749756, + "learning_rate": 8.424765683296215e-05, + "loss": 0.6406553387641907, + "step": 4538 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 1.0751310586929321, + "learning_rate": 8.423015614015892e-05, + "loss": 0.6692186594009399, + "step": 4540 + }, + { + "epoch": 1.9164556962025316, + "grad_norm": 1.13556969165802, + "learning_rate": 8.421264755111607e-05, + "loss": 0.6029785871505737, + "step": 4542 + }, + { + "epoch": 1.9172995780590716, + "grad_norm": 1.1560977697372437, + "learning_rate": 8.419513106987251e-05, + "loss": 0.6457844972610474, + "step": 4544 + }, + { + "epoch": 1.918143459915612, + "grad_norm": 1.2192902565002441, + "learning_rate": 8.417760670046893e-05, + "loss": 0.7082147598266602, + "step": 4546 + }, + { + "epoch": 1.918987341772152, + "grad_norm": 1.1170696020126343, + "learning_rate": 8.41600744469479e-05, + "loss": 0.6919234991073608, + "step": 4548 + }, + { + "epoch": 1.9198312236286919, + "grad_norm": 1.061253547668457, + "learning_rate": 8.414253431335373e-05, + "loss": 0.6310052871704102, + "step": 4550 + }, + { + "epoch": 1.920675105485232, + "grad_norm": 1.0671885013580322, + "learning_rate": 8.412498630373263e-05, + "loss": 0.6330236792564392, + "step": 4552 + }, + { + "epoch": 1.9215189873417722, + "grad_norm": 1.2085163593292236, + "learning_rate": 8.410743042213256e-05, + "loss": 0.7031015157699585, + "step": 4554 + }, + { + "epoch": 1.9223628691983121, + "grad_norm": 1.2682013511657715, + "learning_rate": 8.408986667260334e-05, + "loss": 0.7078304290771484, + "step": 4556 + }, + { + "epoch": 1.9232067510548523, + "grad_norm": 1.2966876029968262, + "learning_rate": 8.407229505919658e-05, + "loss": 0.6542860865592957, + "step": 4558 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 1.1086169481277466, + "learning_rate": 8.405471558596573e-05, + "loss": 0.5856828093528748, + "step": 4560 + }, + { + "epoch": 1.9248945147679324, + "grad_norm": 1.3175504207611084, + "learning_rate": 8.403712825696604e-05, + "loss": 0.7382104992866516, + "step": 4562 + }, + { + "epoch": 1.9257383966244725, + "grad_norm": 1.163164496421814, + "learning_rate": 8.401953307625454e-05, + "loss": 0.6862360239028931, + "step": 4564 + }, + { + "epoch": 1.9265822784810127, + "grad_norm": 1.207650899887085, + "learning_rate": 8.400193004789013e-05, + "loss": 0.7442302703857422, + "step": 4566 + }, + { + "epoch": 1.9274261603375527, + "grad_norm": 1.1570589542388916, + "learning_rate": 8.398431917593345e-05, + "loss": 0.595226526260376, + "step": 4568 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 1.091927170753479, + "learning_rate": 8.396670046444704e-05, + "loss": 0.6360410451889038, + "step": 4570 + }, + { + "epoch": 1.929113924050633, + "grad_norm": 1.149559497833252, + "learning_rate": 8.394907391749516e-05, + "loss": 0.6343122124671936, + "step": 4572 + }, + { + "epoch": 1.929957805907173, + "grad_norm": 1.0585254430770874, + "learning_rate": 8.393143953914395e-05, + "loss": 0.7394745349884033, + "step": 4574 + }, + { + "epoch": 1.930801687763713, + "grad_norm": 1.1648521423339844, + "learning_rate": 8.391379733346128e-05, + "loss": 0.6489678025245667, + "step": 4576 + }, + { + "epoch": 1.9316455696202532, + "grad_norm": 1.1756316423416138, + "learning_rate": 8.389614730451692e-05, + "loss": 0.6687861084938049, + "step": 4578 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.9857237339019775, + "learning_rate": 8.387848945638235e-05, + "loss": 0.523727536201477, + "step": 4580 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.1038693189620972, + "learning_rate": 8.386082379313092e-05, + "loss": 0.6545047760009766, + "step": 4582 + }, + { + "epoch": 1.9341772151898735, + "grad_norm": 1.0780832767486572, + "learning_rate": 8.384315031883774e-05, + "loss": 0.6067036390304565, + "step": 4584 + }, + { + "epoch": 1.9350210970464135, + "grad_norm": 1.2915070056915283, + "learning_rate": 8.382546903757975e-05, + "loss": 0.6880824565887451, + "step": 4586 + }, + { + "epoch": 1.9358649789029536, + "grad_norm": 1.1243441104888916, + "learning_rate": 8.380777995343568e-05, + "loss": 0.7319117188453674, + "step": 4588 + }, + { + "epoch": 1.9367088607594938, + "grad_norm": 1.1143072843551636, + "learning_rate": 8.379008307048609e-05, + "loss": 0.6845395565032959, + "step": 4590 + }, + { + "epoch": 1.9375527426160337, + "grad_norm": 1.039494276046753, + "learning_rate": 8.377237839281327e-05, + "loss": 0.6653600335121155, + "step": 4592 + }, + { + "epoch": 1.9383966244725739, + "grad_norm": 1.299617886543274, + "learning_rate": 8.375466592450136e-05, + "loss": 0.6352495551109314, + "step": 4594 + }, + { + "epoch": 1.939240506329114, + "grad_norm": 0.9918657541275024, + "learning_rate": 8.373694566963631e-05, + "loss": 0.5660957098007202, + "step": 4596 + }, + { + "epoch": 1.940084388185654, + "grad_norm": 1.0540478229522705, + "learning_rate": 8.371921763230579e-05, + "loss": 0.6296496987342834, + "step": 4598 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 1.1309545040130615, + "learning_rate": 8.370148181659939e-05, + "loss": 0.6672025918960571, + "step": 4600 + }, + { + "epoch": 1.9409282700421941, + "eval_loss": 0.6930755376815796, + "eval_runtime": 617.8927, + "eval_samples_per_second": 3.41, + "eval_steps_per_second": 3.41, + "step": 4600 + }, + { + "epoch": 1.9417721518987343, + "grad_norm": 1.2338588237762451, + "learning_rate": 8.368373822660836e-05, + "loss": 0.6200884580612183, + "step": 4602 + }, + { + "epoch": 1.9426160337552743, + "grad_norm": 1.1756945848464966, + "learning_rate": 8.366598686642582e-05, + "loss": 0.653294026851654, + "step": 4604 + }, + { + "epoch": 1.9434599156118142, + "grad_norm": 1.032018780708313, + "learning_rate": 8.364822774014671e-05, + "loss": 0.5670395493507385, + "step": 4606 + }, + { + "epoch": 1.9443037974683546, + "grad_norm": 1.045280933380127, + "learning_rate": 8.363046085186766e-05, + "loss": 0.6819197535514832, + "step": 4608 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 1.3223930597305298, + "learning_rate": 8.36126862056872e-05, + "loss": 0.6952820420265198, + "step": 4610 + }, + { + "epoch": 1.9459915611814345, + "grad_norm": 1.0048432350158691, + "learning_rate": 8.359490380570556e-05, + "loss": 0.5291440486907959, + "step": 4612 + }, + { + "epoch": 1.9468354430379748, + "grad_norm": 1.1477346420288086, + "learning_rate": 8.357711365602483e-05, + "loss": 0.6857813000679016, + "step": 4614 + }, + { + "epoch": 1.9476793248945148, + "grad_norm": 0.959985077381134, + "learning_rate": 8.355931576074882e-05, + "loss": 0.5581508278846741, + "step": 4616 + }, + { + "epoch": 1.9485232067510547, + "grad_norm": 1.1104289293289185, + "learning_rate": 8.35415101239832e-05, + "loss": 0.6536211371421814, + "step": 4618 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 1.2344517707824707, + "learning_rate": 8.352369674983535e-05, + "loss": 0.6570560336112976, + "step": 4620 + }, + { + "epoch": 1.950210970464135, + "grad_norm": 1.3411606550216675, + "learning_rate": 8.350587564241451e-05, + "loss": 0.6070495247840881, + "step": 4622 + }, + { + "epoch": 1.951054852320675, + "grad_norm": 1.1713159084320068, + "learning_rate": 8.348804680583166e-05, + "loss": 0.6444135904312134, + "step": 4624 + }, + { + "epoch": 1.9518987341772152, + "grad_norm": 1.127242922782898, + "learning_rate": 8.347021024419954e-05, + "loss": 0.6517419815063477, + "step": 4626 + }, + { + "epoch": 1.9527426160337553, + "grad_norm": 1.0733028650283813, + "learning_rate": 8.345236596163274e-05, + "loss": 0.6174065470695496, + "step": 4628 + }, + { + "epoch": 1.9535864978902953, + "grad_norm": 1.1114680767059326, + "learning_rate": 8.343451396224757e-05, + "loss": 0.7163593769073486, + "step": 4630 + }, + { + "epoch": 1.9544303797468354, + "grad_norm": 1.0839568376541138, + "learning_rate": 8.341665425016216e-05, + "loss": 0.698553204536438, + "step": 4632 + }, + { + "epoch": 1.9552742616033756, + "grad_norm": 1.17001211643219, + "learning_rate": 8.339878682949638e-05, + "loss": 0.6224857568740845, + "step": 4634 + }, + { + "epoch": 1.9561181434599155, + "grad_norm": 3.483793020248413, + "learning_rate": 8.338091170437193e-05, + "loss": 0.5931200981140137, + "step": 4636 + }, + { + "epoch": 1.9569620253164557, + "grad_norm": 1.1575394868850708, + "learning_rate": 8.336302887891224e-05, + "loss": 0.6031442284584045, + "step": 4638 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 1.1494992971420288, + "learning_rate": 8.334513835724252e-05, + "loss": 0.6101768016815186, + "step": 4640 + }, + { + "epoch": 1.9586497890295358, + "grad_norm": 1.3858197927474976, + "learning_rate": 8.332724014348981e-05, + "loss": 0.6571711301803589, + "step": 4642 + }, + { + "epoch": 1.959493670886076, + "grad_norm": 1.1094943284988403, + "learning_rate": 8.330933424178284e-05, + "loss": 0.6391071677207947, + "step": 4644 + }, + { + "epoch": 1.9603375527426161, + "grad_norm": 1.1640198230743408, + "learning_rate": 8.329142065625218e-05, + "loss": 0.6542805433273315, + "step": 4646 + }, + { + "epoch": 1.961181434599156, + "grad_norm": 1.1080211400985718, + "learning_rate": 8.327349939103016e-05, + "loss": 0.6053075194358826, + "step": 4648 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.0137052536010742, + "learning_rate": 8.325557045025085e-05, + "loss": 0.6009573340415955, + "step": 4650 + }, + { + "epoch": 1.9628691983122364, + "grad_norm": 1.0867283344268799, + "learning_rate": 8.323763383805012e-05, + "loss": 0.5993483066558838, + "step": 4652 + }, + { + "epoch": 1.9637130801687763, + "grad_norm": 1.0577161312103271, + "learning_rate": 8.321968955856562e-05, + "loss": 0.6788463592529297, + "step": 4654 + }, + { + "epoch": 1.9645569620253165, + "grad_norm": 1.2002183198928833, + "learning_rate": 8.320173761593672e-05, + "loss": 0.5786917209625244, + "step": 4656 + }, + { + "epoch": 1.9654008438818567, + "grad_norm": 1.2266993522644043, + "learning_rate": 8.318377801430461e-05, + "loss": 0.7437994480133057, + "step": 4658 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 1.007582187652588, + "learning_rate": 8.316581075781223e-05, + "loss": 0.6763550639152527, + "step": 4660 + }, + { + "epoch": 1.9670886075949368, + "grad_norm": 1.2374811172485352, + "learning_rate": 8.314783585060425e-05, + "loss": 0.6953140497207642, + "step": 4662 + }, + { + "epoch": 1.967932489451477, + "grad_norm": 1.1791057586669922, + "learning_rate": 8.312985329682717e-05, + "loss": 0.6867341995239258, + "step": 4664 + }, + { + "epoch": 1.9687763713080169, + "grad_norm": 1.1903331279754639, + "learning_rate": 8.31118631006292e-05, + "loss": 0.6445001363754272, + "step": 4666 + }, + { + "epoch": 1.9696202531645568, + "grad_norm": 1.1731067895889282, + "learning_rate": 8.309386526616034e-05, + "loss": 0.6500589847564697, + "step": 4668 + }, + { + "epoch": 1.9704641350210972, + "grad_norm": 0.9470233917236328, + "learning_rate": 8.307585979757233e-05, + "loss": 0.6215718984603882, + "step": 4670 + }, + { + "epoch": 1.9713080168776371, + "grad_norm": 1.2900800704956055, + "learning_rate": 8.305784669901872e-05, + "loss": 0.6396787762641907, + "step": 4672 + }, + { + "epoch": 1.972151898734177, + "grad_norm": 1.1729133129119873, + "learning_rate": 8.303982597465474e-05, + "loss": 0.6581959128379822, + "step": 4674 + }, + { + "epoch": 1.9729957805907175, + "grad_norm": 1.1450555324554443, + "learning_rate": 8.302179762863746e-05, + "loss": 0.7013490796089172, + "step": 4676 + }, + { + "epoch": 1.9738396624472574, + "grad_norm": 1.1506338119506836, + "learning_rate": 8.300376166512567e-05, + "loss": 0.6796102523803711, + "step": 4678 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 1.149979591369629, + "learning_rate": 8.298571808827991e-05, + "loss": 0.6960519552230835, + "step": 4680 + }, + { + "epoch": 1.9755274261603377, + "grad_norm": 1.1078912019729614, + "learning_rate": 8.296766690226249e-05, + "loss": 0.6789507865905762, + "step": 4682 + }, + { + "epoch": 1.9763713080168777, + "grad_norm": 1.0199202299118042, + "learning_rate": 8.294960811123747e-05, + "loss": 0.5962659120559692, + "step": 4684 + }, + { + "epoch": 1.9772151898734176, + "grad_norm": 1.2226134538650513, + "learning_rate": 8.293154171937068e-05, + "loss": 0.6483094692230225, + "step": 4686 + }, + { + "epoch": 1.9780590717299578, + "grad_norm": 1.184095025062561, + "learning_rate": 8.291346773082965e-05, + "loss": 0.6750242710113525, + "step": 4688 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 1.1018693447113037, + "learning_rate": 8.289538614978375e-05, + "loss": 0.7094066739082336, + "step": 4690 + }, + { + "epoch": 1.9797468354430379, + "grad_norm": 1.0342390537261963, + "learning_rate": 8.287729698040403e-05, + "loss": 0.6554126739501953, + "step": 4692 + }, + { + "epoch": 1.980590717299578, + "grad_norm": 1.0603563785552979, + "learning_rate": 8.285920022686332e-05, + "loss": 0.5493529438972473, + "step": 4694 + }, + { + "epoch": 1.9814345991561182, + "grad_norm": 1.139609932899475, + "learning_rate": 8.284109589333617e-05, + "loss": 0.6824741363525391, + "step": 4696 + }, + { + "epoch": 1.9822784810126581, + "grad_norm": 1.2167822122573853, + "learning_rate": 8.282298398399895e-05, + "loss": 0.7121000289916992, + "step": 4698 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 1.109857201576233, + "learning_rate": 8.280486450302968e-05, + "loss": 0.6711249351501465, + "step": 4700 + }, + { + "epoch": 1.9831223628691983, + "eval_loss": 0.6923081278800964, + "eval_runtime": 514.7729, + "eval_samples_per_second": 4.093, + "eval_steps_per_second": 4.093, + "step": 4700 + }, + { + "epoch": 1.9839662447257385, + "grad_norm": 1.1387107372283936, + "learning_rate": 8.27867374546082e-05, + "loss": 0.581635594367981, + "step": 4702 + }, + { + "epoch": 1.9848101265822784, + "grad_norm": 1.2519257068634033, + "learning_rate": 8.27686028429161e-05, + "loss": 0.6867302060127258, + "step": 4704 + }, + { + "epoch": 1.9856540084388186, + "grad_norm": 1.0927205085754395, + "learning_rate": 8.275046067213663e-05, + "loss": 0.6494556665420532, + "step": 4706 + }, + { + "epoch": 1.9864978902953587, + "grad_norm": 1.042035698890686, + "learning_rate": 8.273231094645487e-05, + "loss": 0.6949493288993835, + "step": 4708 + }, + { + "epoch": 1.9873417721518987, + "grad_norm": 1.0220824480056763, + "learning_rate": 8.271415367005762e-05, + "loss": 0.6535884737968445, + "step": 4710 + }, + { + "epoch": 1.9881856540084388, + "grad_norm": 1.3023611307144165, + "learning_rate": 8.269598884713339e-05, + "loss": 0.6635278463363647, + "step": 4712 + }, + { + "epoch": 1.989029535864979, + "grad_norm": 1.2526965141296387, + "learning_rate": 8.267781648187248e-05, + "loss": 0.7194697856903076, + "step": 4714 + }, + { + "epoch": 1.989873417721519, + "grad_norm": 1.0388038158416748, + "learning_rate": 8.265963657846691e-05, + "loss": 0.6355333924293518, + "step": 4716 + }, + { + "epoch": 1.990717299578059, + "grad_norm": 1.0852965116500854, + "learning_rate": 8.264144914111041e-05, + "loss": 0.6898305416107178, + "step": 4718 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 1.0714049339294434, + "learning_rate": 8.262325417399847e-05, + "loss": 0.6202836036682129, + "step": 4720 + }, + { + "epoch": 1.9924050632911392, + "grad_norm": 1.0767238140106201, + "learning_rate": 8.260505168132835e-05, + "loss": 0.6160458326339722, + "step": 4722 + }, + { + "epoch": 1.9932489451476794, + "grad_norm": 0.9605211615562439, + "learning_rate": 8.258684166729899e-05, + "loss": 0.6049920916557312, + "step": 4724 + }, + { + "epoch": 1.9940928270042195, + "grad_norm": 1.0580185651779175, + "learning_rate": 8.256862413611113e-05, + "loss": 0.5622014999389648, + "step": 4726 + }, + { + "epoch": 1.9949367088607595, + "grad_norm": 1.1039034128189087, + "learning_rate": 8.255039909196713e-05, + "loss": 0.6678924560546875, + "step": 4728 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 1.1482586860656738, + "learning_rate": 8.253216653907123e-05, + "loss": 0.658260703086853, + "step": 4730 + }, + { + "epoch": 1.9966244725738398, + "grad_norm": 1.135349988937378, + "learning_rate": 8.251392648162929e-05, + "loss": 0.6461613178253174, + "step": 4732 + }, + { + "epoch": 1.9974683544303797, + "grad_norm": 1.0155420303344727, + "learning_rate": 8.249567892384895e-05, + "loss": 0.6837426424026489, + "step": 4734 + }, + { + "epoch": 1.9983122362869197, + "grad_norm": 1.3392970561981201, + "learning_rate": 8.247742386993958e-05, + "loss": 0.6091697812080383, + "step": 4736 + }, + { + "epoch": 1.99915611814346, + "grad_norm": 1.0509974956512451, + "learning_rate": 8.245916132411226e-05, + "loss": 0.6539653539657593, + "step": 4738 + }, + { + "epoch": 2.0, + "grad_norm": 0.9777396321296692, + "learning_rate": 8.244089129057982e-05, + "loss": 0.5630147457122803, + "step": 4740 + }, + { + "epoch": 2.00084388185654, + "grad_norm": 1.1639164686203003, + "learning_rate": 8.24226137735568e-05, + "loss": 0.6190353631973267, + "step": 4742 + }, + { + "epoch": 2.0016877637130803, + "grad_norm": 1.119614839553833, + "learning_rate": 8.240432877725947e-05, + "loss": 0.6282529234886169, + "step": 4744 + }, + { + "epoch": 2.0025316455696203, + "grad_norm": 1.114739179611206, + "learning_rate": 8.238603630590581e-05, + "loss": 0.6176725625991821, + "step": 4746 + }, + { + "epoch": 2.00337552742616, + "grad_norm": 1.0543076992034912, + "learning_rate": 8.236773636371557e-05, + "loss": 0.5182007551193237, + "step": 4748 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 1.060389518737793, + "learning_rate": 8.234942895491019e-05, + "loss": 0.532536506652832, + "step": 4750 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 1.0824412107467651, + "learning_rate": 8.233111408371282e-05, + "loss": 0.5474061369895935, + "step": 4752 + }, + { + "epoch": 2.0059071729957805, + "grad_norm": 1.1450858116149902, + "learning_rate": 8.231279175434838e-05, + "loss": 0.586384654045105, + "step": 4754 + }, + { + "epoch": 2.006751054852321, + "grad_norm": 1.1225577592849731, + "learning_rate": 8.229446197104345e-05, + "loss": 0.6469444036483765, + "step": 4756 + }, + { + "epoch": 2.007594936708861, + "grad_norm": 1.7292449474334717, + "learning_rate": 8.227612473802637e-05, + "loss": 0.5371572971343994, + "step": 4758 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 1.1743781566619873, + "learning_rate": 8.22577800595272e-05, + "loss": 0.558707058429718, + "step": 4760 + }, + { + "epoch": 2.009282700421941, + "grad_norm": 1.0385273694992065, + "learning_rate": 8.223942793977769e-05, + "loss": 0.5943514108657837, + "step": 4762 + }, + { + "epoch": 2.010126582278481, + "grad_norm": 1.1302000284194946, + "learning_rate": 8.222106838301131e-05, + "loss": 0.5630753636360168, + "step": 4764 + }, + { + "epoch": 2.010970464135021, + "grad_norm": 1.140005111694336, + "learning_rate": 8.220270139346327e-05, + "loss": 0.527510404586792, + "step": 4766 + }, + { + "epoch": 2.0118143459915614, + "grad_norm": 1.1979734897613525, + "learning_rate": 8.21843269753705e-05, + "loss": 0.6315013766288757, + "step": 4768 + }, + { + "epoch": 2.0126582278481013, + "grad_norm": 1.3759459257125854, + "learning_rate": 8.21659451329716e-05, + "loss": 0.6225199699401855, + "step": 4770 + }, + { + "epoch": 2.0135021097046413, + "grad_norm": 1.330600380897522, + "learning_rate": 8.21475558705069e-05, + "loss": 0.6838938593864441, + "step": 4772 + }, + { + "epoch": 2.014345991561181, + "grad_norm": 1.2365351915359497, + "learning_rate": 8.21291591922185e-05, + "loss": 0.606302797794342, + "step": 4774 + }, + { + "epoch": 2.0151898734177216, + "grad_norm": 1.1886142492294312, + "learning_rate": 8.211075510235011e-05, + "loss": 0.6194182634353638, + "step": 4776 + }, + { + "epoch": 2.0160337552742615, + "grad_norm": 1.1414743661880493, + "learning_rate": 8.209234360514721e-05, + "loss": 0.639540433883667, + "step": 4778 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 1.2877455949783325, + "learning_rate": 8.2073924704857e-05, + "loss": 0.6350902318954468, + "step": 4780 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 1.095578908920288, + "learning_rate": 8.205549840572834e-05, + "loss": 0.5152000784873962, + "step": 4782 + }, + { + "epoch": 2.018565400843882, + "grad_norm": 1.0043798685073853, + "learning_rate": 8.203706471201183e-05, + "loss": 0.46245837211608887, + "step": 4784 + }, + { + "epoch": 2.0194092827004217, + "grad_norm": 1.2133857011795044, + "learning_rate": 8.201862362795979e-05, + "loss": 0.6471722722053528, + "step": 4786 + }, + { + "epoch": 2.020253164556962, + "grad_norm": 1.0835390090942383, + "learning_rate": 8.200017515782619e-05, + "loss": 0.5790625214576721, + "step": 4788 + }, + { + "epoch": 2.021097046413502, + "grad_norm": 1.0176091194152832, + "learning_rate": 8.198171930586678e-05, + "loss": 0.5826238989830017, + "step": 4790 + }, + { + "epoch": 2.021940928270042, + "grad_norm": 1.1581370830535889, + "learning_rate": 8.196325607633893e-05, + "loss": 0.5781272649765015, + "step": 4792 + }, + { + "epoch": 2.0227848101265824, + "grad_norm": 1.243381142616272, + "learning_rate": 8.194478547350178e-05, + "loss": 0.6600401997566223, + "step": 4794 + }, + { + "epoch": 2.0236286919831223, + "grad_norm": 1.0718560218811035, + "learning_rate": 8.192630750161612e-05, + "loss": 0.5291268825531006, + "step": 4796 + }, + { + "epoch": 2.0244725738396623, + "grad_norm": 1.2338320016860962, + "learning_rate": 8.190782216494448e-05, + "loss": 0.6564924120903015, + "step": 4798 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.978547990322113, + "learning_rate": 8.188932946775107e-05, + "loss": 0.5471183657646179, + "step": 4800 + }, + { + "epoch": 2.0253164556962027, + "eval_loss": 0.6924457550048828, + "eval_runtime": 514.0427, + "eval_samples_per_second": 4.099, + "eval_steps_per_second": 4.099, + "step": 4800 + }, + { + "epoch": 2.0261603375527426, + "grad_norm": 1.1782792806625366, + "learning_rate": 8.18708294143018e-05, + "loss": 0.567442774772644, + "step": 4802 + }, + { + "epoch": 2.0270042194092825, + "grad_norm": 1.0768574476242065, + "learning_rate": 8.185232200886426e-05, + "loss": 0.6005180478096008, + "step": 4804 + }, + { + "epoch": 2.027848101265823, + "grad_norm": 1.3096717596054077, + "learning_rate": 8.18338072557078e-05, + "loss": 0.616436779499054, + "step": 4806 + }, + { + "epoch": 2.028691983122363, + "grad_norm": 1.0233508348464966, + "learning_rate": 8.181528515910336e-05, + "loss": 0.49587416648864746, + "step": 4808 + }, + { + "epoch": 2.029535864978903, + "grad_norm": 1.0800065994262695, + "learning_rate": 8.179675572332366e-05, + "loss": 0.5758571624755859, + "step": 4810 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 1.09299898147583, + "learning_rate": 8.177821895264309e-05, + "loss": 0.561736524105072, + "step": 4812 + }, + { + "epoch": 2.031223628691983, + "grad_norm": 1.1439210176467896, + "learning_rate": 8.175967485133771e-05, + "loss": 0.5249468088150024, + "step": 4814 + }, + { + "epoch": 2.032067510548523, + "grad_norm": 1.15841805934906, + "learning_rate": 8.174112342368532e-05, + "loss": 0.6429001688957214, + "step": 4816 + }, + { + "epoch": 2.0329113924050635, + "grad_norm": 1.1720670461654663, + "learning_rate": 8.172256467396533e-05, + "loss": 0.60152667760849, + "step": 4818 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 1.2652091979980469, + "learning_rate": 8.170399860645892e-05, + "loss": 0.5553541779518127, + "step": 4820 + }, + { + "epoch": 2.0345991561181433, + "grad_norm": 1.0768507719039917, + "learning_rate": 8.168542522544893e-05, + "loss": 0.5369323492050171, + "step": 4822 + }, + { + "epoch": 2.0354430379746837, + "grad_norm": 0.9906469583511353, + "learning_rate": 8.166684453521986e-05, + "loss": 0.5468952655792236, + "step": 4824 + }, + { + "epoch": 2.0362869198312237, + "grad_norm": 1.3448988199234009, + "learning_rate": 8.164825654005792e-05, + "loss": 0.5795659422874451, + "step": 4826 + }, + { + "epoch": 2.0371308016877636, + "grad_norm": 1.2502341270446777, + "learning_rate": 8.162966124425103e-05, + "loss": 0.6465779542922974, + "step": 4828 + }, + { + "epoch": 2.037974683544304, + "grad_norm": 1.1512303352355957, + "learning_rate": 8.161105865208875e-05, + "loss": 0.5509394407272339, + "step": 4830 + }, + { + "epoch": 2.038818565400844, + "grad_norm": 1.2513408660888672, + "learning_rate": 8.159244876786232e-05, + "loss": 0.5515735745429993, + "step": 4832 + }, + { + "epoch": 2.039662447257384, + "grad_norm": 1.3035682439804077, + "learning_rate": 8.157383159586473e-05, + "loss": 0.757799506187439, + "step": 4834 + }, + { + "epoch": 2.0405063291139243, + "grad_norm": 1.1136540174484253, + "learning_rate": 8.155520714039056e-05, + "loss": 0.607295036315918, + "step": 4836 + }, + { + "epoch": 2.041350210970464, + "grad_norm": 1.220146656036377, + "learning_rate": 8.153657540573613e-05, + "loss": 0.5769712328910828, + "step": 4838 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 1.2104195356369019, + "learning_rate": 8.151793639619944e-05, + "loss": 0.5746933817863464, + "step": 4840 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 1.241708517074585, + "learning_rate": 8.149929011608014e-05, + "loss": 0.5932332277297974, + "step": 4842 + }, + { + "epoch": 2.0438818565400845, + "grad_norm": 1.1172713041305542, + "learning_rate": 8.148063656967955e-05, + "loss": 0.583284318447113, + "step": 4844 + }, + { + "epoch": 2.0447257383966244, + "grad_norm": 1.0867618322372437, + "learning_rate": 8.14619757613007e-05, + "loss": 0.5589476823806763, + "step": 4846 + }, + { + "epoch": 2.0455696202531644, + "grad_norm": 1.2470483779907227, + "learning_rate": 8.14433076952483e-05, + "loss": 0.6118156313896179, + "step": 4848 + }, + { + "epoch": 2.0464135021097047, + "grad_norm": 1.0908832550048828, + "learning_rate": 8.142463237582868e-05, + "loss": 0.5815895795822144, + "step": 4850 + }, + { + "epoch": 2.0472573839662447, + "grad_norm": 1.2589281797409058, + "learning_rate": 8.140594980734989e-05, + "loss": 0.6232373714447021, + "step": 4852 + }, + { + "epoch": 2.0481012658227846, + "grad_norm": 1.234152913093567, + "learning_rate": 8.138725999412165e-05, + "loss": 0.5992053151130676, + "step": 4854 + }, + { + "epoch": 2.048945147679325, + "grad_norm": 1.3304446935653687, + "learning_rate": 8.136856294045533e-05, + "loss": 0.6494496464729309, + "step": 4856 + }, + { + "epoch": 2.049789029535865, + "grad_norm": 1.1871088743209839, + "learning_rate": 8.134985865066398e-05, + "loss": 0.6263431906700134, + "step": 4858 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 1.1454699039459229, + "learning_rate": 8.133114712906234e-05, + "loss": 0.6036502122879028, + "step": 4860 + }, + { + "epoch": 2.0514767932489453, + "grad_norm": 1.2953420877456665, + "learning_rate": 8.131242837996675e-05, + "loss": 0.5674451589584351, + "step": 4862 + }, + { + "epoch": 2.052320675105485, + "grad_norm": 1.1874405145645142, + "learning_rate": 8.129370240769534e-05, + "loss": 0.5616317987442017, + "step": 4864 + }, + { + "epoch": 2.053164556962025, + "grad_norm": 1.2936227321624756, + "learning_rate": 8.127496921656777e-05, + "loss": 0.6495023369789124, + "step": 4866 + }, + { + "epoch": 2.0540084388185655, + "grad_norm": 1.1935228109359741, + "learning_rate": 8.125622881090544e-05, + "loss": 0.6028099060058594, + "step": 4868 + }, + { + "epoch": 2.0548523206751055, + "grad_norm": 0.9932331442832947, + "learning_rate": 8.123748119503143e-05, + "loss": 0.476296067237854, + "step": 4870 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 1.3878839015960693, + "learning_rate": 8.121872637327042e-05, + "loss": 0.6191902756690979, + "step": 4872 + }, + { + "epoch": 2.056540084388186, + "grad_norm": 1.1185581684112549, + "learning_rate": 8.11999643499488e-05, + "loss": 0.566487729549408, + "step": 4874 + }, + { + "epoch": 2.0573839662447257, + "grad_norm": 1.3729257583618164, + "learning_rate": 8.118119512939464e-05, + "loss": 0.5970078706741333, + "step": 4876 + }, + { + "epoch": 2.0582278481012657, + "grad_norm": 1.1332688331604004, + "learning_rate": 8.11624187159376e-05, + "loss": 0.570341944694519, + "step": 4878 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 1.2648937702178955, + "learning_rate": 8.114363511390903e-05, + "loss": 0.6302897334098816, + "step": 4880 + }, + { + "epoch": 2.059915611814346, + "grad_norm": 1.250616192817688, + "learning_rate": 8.112484432764197e-05, + "loss": 0.5619142651557922, + "step": 4882 + }, + { + "epoch": 2.060759493670886, + "grad_norm": 0.9710861444473267, + "learning_rate": 8.110604636147109e-05, + "loss": 0.5426228642463684, + "step": 4884 + }, + { + "epoch": 2.0616033755274263, + "grad_norm": 1.1979506015777588, + "learning_rate": 8.108724121973271e-05, + "loss": 0.5498107671737671, + "step": 4886 + }, + { + "epoch": 2.0624472573839663, + "grad_norm": 1.0936485528945923, + "learning_rate": 8.106842890676483e-05, + "loss": 0.5695134401321411, + "step": 4888 + }, + { + "epoch": 2.0632911392405062, + "grad_norm": 1.1246092319488525, + "learning_rate": 8.10496094269071e-05, + "loss": 0.5998331308364868, + "step": 4890 + }, + { + "epoch": 2.0641350210970466, + "grad_norm": 1.244438648223877, + "learning_rate": 8.103078278450075e-05, + "loss": 0.5702623128890991, + "step": 4892 + }, + { + "epoch": 2.0649789029535865, + "grad_norm": 1.1585633754730225, + "learning_rate": 8.101194898388881e-05, + "loss": 0.5392299890518188, + "step": 4894 + }, + { + "epoch": 2.0658227848101265, + "grad_norm": 1.3044285774230957, + "learning_rate": 8.099310802941582e-05, + "loss": 0.5640127658843994, + "step": 4896 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.2483032941818237, + "learning_rate": 8.097425992542804e-05, + "loss": 0.6103175282478333, + "step": 4898 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 1.0845462083816528, + "learning_rate": 8.095540467627337e-05, + "loss": 0.5041166543960571, + "step": 4900 + }, + { + "epoch": 2.067510548523207, + "eval_loss": 0.6941288113594055, + "eval_runtime": 513.4497, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 4900 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 1.2493232488632202, + "learning_rate": 8.093654228630134e-05, + "loss": 0.6253946423530579, + "step": 4902 + }, + { + "epoch": 2.0691983122362867, + "grad_norm": 1.1668756008148193, + "learning_rate": 8.091767275986317e-05, + "loss": 0.523486852645874, + "step": 4904 + }, + { + "epoch": 2.070042194092827, + "grad_norm": 1.1709638833999634, + "learning_rate": 8.089879610131167e-05, + "loss": 0.5569989681243896, + "step": 4906 + }, + { + "epoch": 2.070886075949367, + "grad_norm": 1.1044740676879883, + "learning_rate": 8.087991231500133e-05, + "loss": 0.642728865146637, + "step": 4908 + }, + { + "epoch": 2.071729957805907, + "grad_norm": 1.1032549142837524, + "learning_rate": 8.086102140528828e-05, + "loss": 0.5998259782791138, + "step": 4910 + }, + { + "epoch": 2.0725738396624473, + "grad_norm": 0.9980027079582214, + "learning_rate": 8.08421233765303e-05, + "loss": 0.5460172891616821, + "step": 4912 + }, + { + "epoch": 2.0734177215189873, + "grad_norm": 1.0866090059280396, + "learning_rate": 8.082321823308679e-05, + "loss": 0.5643284916877747, + "step": 4914 + }, + { + "epoch": 2.0742616033755272, + "grad_norm": 1.1942687034606934, + "learning_rate": 8.080430597931878e-05, + "loss": 0.554400622844696, + "step": 4916 + }, + { + "epoch": 2.0751054852320676, + "grad_norm": 1.0680599212646484, + "learning_rate": 8.078538661958901e-05, + "loss": 0.5955621004104614, + "step": 4918 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 1.20845627784729, + "learning_rate": 8.076646015826179e-05, + "loss": 0.5970203280448914, + "step": 4920 + }, + { + "epoch": 2.0767932489451475, + "grad_norm": 1.8368924856185913, + "learning_rate": 8.074752659970308e-05, + "loss": 0.6467664837837219, + "step": 4922 + }, + { + "epoch": 2.077637130801688, + "grad_norm": 1.3291922807693481, + "learning_rate": 8.072858594828053e-05, + "loss": 0.630719006061554, + "step": 4924 + }, + { + "epoch": 2.078481012658228, + "grad_norm": 1.1496083736419678, + "learning_rate": 8.070963820836333e-05, + "loss": 0.601140022277832, + "step": 4926 + }, + { + "epoch": 2.0793248945147678, + "grad_norm": 1.1562724113464355, + "learning_rate": 8.069068338432239e-05, + "loss": 0.6096881031990051, + "step": 4928 + }, + { + "epoch": 2.080168776371308, + "grad_norm": 1.0115300416946411, + "learning_rate": 8.067172148053021e-05, + "loss": 0.5085908770561218, + "step": 4930 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 1.2181830406188965, + "learning_rate": 8.065275250136097e-05, + "loss": 0.5268720984458923, + "step": 4932 + }, + { + "epoch": 2.081856540084388, + "grad_norm": 1.1249788999557495, + "learning_rate": 8.06337764511904e-05, + "loss": 0.6075665950775146, + "step": 4934 + }, + { + "epoch": 2.0827004219409284, + "grad_norm": 1.1143964529037476, + "learning_rate": 8.061479333439595e-05, + "loss": 0.59170001745224, + "step": 4936 + }, + { + "epoch": 2.0835443037974684, + "grad_norm": 1.4773131608963013, + "learning_rate": 8.059580315535664e-05, + "loss": 0.6689745187759399, + "step": 4938 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 1.143965244293213, + "learning_rate": 8.057680591845316e-05, + "loss": 0.5409777760505676, + "step": 4940 + }, + { + "epoch": 2.0852320675105487, + "grad_norm": 1.0384942293167114, + "learning_rate": 8.055780162806777e-05, + "loss": 0.5778636336326599, + "step": 4942 + }, + { + "epoch": 2.0860759493670886, + "grad_norm": 1.0102177858352661, + "learning_rate": 8.053879028858442e-05, + "loss": 0.5576038360595703, + "step": 4944 + }, + { + "epoch": 2.0869198312236286, + "grad_norm": 1.3792158365249634, + "learning_rate": 8.051977190438868e-05, + "loss": 0.5873376131057739, + "step": 4946 + }, + { + "epoch": 2.087763713080169, + "grad_norm": 1.4402949810028076, + "learning_rate": 8.050074647986768e-05, + "loss": 0.6067743301391602, + "step": 4948 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.2719058990478516, + "learning_rate": 8.048171401941027e-05, + "loss": 0.604671835899353, + "step": 4950 + }, + { + "epoch": 2.089451476793249, + "grad_norm": 1.1054867506027222, + "learning_rate": 8.046267452740683e-05, + "loss": 0.5743544697761536, + "step": 4952 + }, + { + "epoch": 2.090295358649789, + "grad_norm": 1.0521535873413086, + "learning_rate": 8.044362800824944e-05, + "loss": 0.576278567314148, + "step": 4954 + }, + { + "epoch": 2.091139240506329, + "grad_norm": 1.2665088176727295, + "learning_rate": 8.042457446633174e-05, + "loss": 0.5903641581535339, + "step": 4956 + }, + { + "epoch": 2.091983122362869, + "grad_norm": 1.1283398866653442, + "learning_rate": 8.040551390604902e-05, + "loss": 0.5854214429855347, + "step": 4958 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 1.1194316148757935, + "learning_rate": 8.03864463317982e-05, + "loss": 0.5843619108200073, + "step": 4960 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 1.3581651449203491, + "learning_rate": 8.036737174797778e-05, + "loss": 0.6115096211433411, + "step": 4962 + }, + { + "epoch": 2.0945147679324894, + "grad_norm": 1.341748595237732, + "learning_rate": 8.034829015898793e-05, + "loss": 0.5998795032501221, + "step": 4964 + }, + { + "epoch": 2.0953586497890297, + "grad_norm": 1.2212611436843872, + "learning_rate": 8.032920156923038e-05, + "loss": 0.628372311592102, + "step": 4966 + }, + { + "epoch": 2.0962025316455697, + "grad_norm": 1.1348317861557007, + "learning_rate": 8.031010598310851e-05, + "loss": 0.5668916702270508, + "step": 4968 + }, + { + "epoch": 2.0970464135021096, + "grad_norm": 1.1106547117233276, + "learning_rate": 8.029100340502731e-05, + "loss": 0.5253881216049194, + "step": 4970 + }, + { + "epoch": 2.09789029535865, + "grad_norm": 1.2471354007720947, + "learning_rate": 8.027189383939339e-05, + "loss": 0.5790762901306152, + "step": 4972 + }, + { + "epoch": 2.09873417721519, + "grad_norm": 1.2477394342422485, + "learning_rate": 8.025277729061492e-05, + "loss": 0.6382888555526733, + "step": 4974 + }, + { + "epoch": 2.09957805907173, + "grad_norm": 1.2716054916381836, + "learning_rate": 8.023365376310176e-05, + "loss": 0.5962072610855103, + "step": 4976 + }, + { + "epoch": 2.10042194092827, + "grad_norm": 1.257820725440979, + "learning_rate": 8.021452326126532e-05, + "loss": 0.5882940292358398, + "step": 4978 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 1.0924186706542969, + "learning_rate": 8.019538578951864e-05, + "loss": 0.5640701055526733, + "step": 4980 + }, + { + "epoch": 2.10210970464135, + "grad_norm": 1.1250383853912354, + "learning_rate": 8.017624135227637e-05, + "loss": 0.5746428966522217, + "step": 4982 + }, + { + "epoch": 2.10295358649789, + "grad_norm": 1.131323218345642, + "learning_rate": 8.015708995395477e-05, + "loss": 0.5611346960067749, + "step": 4984 + }, + { + "epoch": 2.1037974683544305, + "grad_norm": 1.4267152547836304, + "learning_rate": 8.013793159897171e-05, + "loss": 0.6173797249794006, + "step": 4986 + }, + { + "epoch": 2.1046413502109704, + "grad_norm": 1.41414213180542, + "learning_rate": 8.011876629174662e-05, + "loss": 0.64865642786026, + "step": 4988 + }, + { + "epoch": 2.1054852320675104, + "grad_norm": 1.1498184204101562, + "learning_rate": 8.00995940367006e-05, + "loss": 0.6125827431678772, + "step": 4990 + }, + { + "epoch": 2.1063291139240508, + "grad_norm": 1.2327708005905151, + "learning_rate": 8.00804148382563e-05, + "loss": 0.670495867729187, + "step": 4992 + }, + { + "epoch": 2.1071729957805907, + "grad_norm": 1.2797311544418335, + "learning_rate": 8.0061228700838e-05, + "loss": 0.6020209193229675, + "step": 4994 + }, + { + "epoch": 2.1080168776371306, + "grad_norm": 1.079584002494812, + "learning_rate": 8.004203562887157e-05, + "loss": 0.5974310636520386, + "step": 4996 + }, + { + "epoch": 2.108860759493671, + "grad_norm": 1.4352604150772095, + "learning_rate": 8.002283562678452e-05, + "loss": 0.6424587368965149, + "step": 4998 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 1.0876719951629639, + "learning_rate": 8.000362869900586e-05, + "loss": 0.6185846328735352, + "step": 5000 + }, + { + "epoch": 2.109704641350211, + "eval_loss": 0.6908889412879944, + "eval_runtime": 675.8398, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "step": 5000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.194397741442583e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ddf193a8f616595bba23d4e401cc8993f02cbad --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c502329e2786e5306d6409c7f519d7617bd852df4c7a9f6dd5606d34191624a +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a592178cb06b8520e63d07e4708fc114d9e9cb1 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb53c039a45aa66dfced0b82d03fe809e987c1c47541a242c1eaa298dbcf2101 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..076bf071111abc1f0af080aca26d719329439c1b --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:232026cea0f8e9f70f1cec5c1cf0733db34d08750d23734a245309be5485eca8 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..11c6d17a182f30e6ea4034a079cf421b918bf498 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e9915cae72ba5435e7f2d5d2f55026b3ac135a5c1f1e70f902c96d3220ca87 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b73690d16a09096383ad3c827c4e6030bc25e1e1 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/trainer_state.json @@ -0,0 +1,19733 @@ +{ + "best_global_step": 5500, + "best_metric": 0.6867148876190186, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-5500", + "epoch": 2.320675105485232, + "eval_steps": 100, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + }, + { + "epoch": 1.89957805907173, + "grad_norm": 0.9778118133544922, + "learning_rate": 8.456131445302538e-05, + "loss": 0.5973100662231445, + "step": 4502 + }, + { + "epoch": 1.90042194092827, + "grad_norm": 0.9587083458900452, + "learning_rate": 8.454395658110536e-05, + "loss": 0.5982911586761475, + "step": 4504 + }, + { + "epoch": 1.90126582278481, + "grad_norm": 1.327643871307373, + "learning_rate": 8.452659074055798e-05, + "loss": 0.6858586668968201, + "step": 4506 + }, + { + "epoch": 1.9021097046413502, + "grad_norm": 1.0740257501602173, + "learning_rate": 8.450921693538922e-05, + "loss": 0.6172328591346741, + "step": 4508 + }, + { + "epoch": 1.9029535864978904, + "grad_norm": 1.0705101490020752, + "learning_rate": 8.449183516960685e-05, + "loss": 0.5349634289741516, + "step": 4510 + }, + { + "epoch": 1.9037974683544303, + "grad_norm": 0.9151237607002258, + "learning_rate": 8.447444544722058e-05, + "loss": 0.5769277811050415, + "step": 4512 + }, + { + "epoch": 1.9046413502109705, + "grad_norm": 1.139900803565979, + "learning_rate": 8.44570477722418e-05, + "loss": 0.6579093933105469, + "step": 4514 + }, + { + "epoch": 1.9054852320675106, + "grad_norm": 1.2481658458709717, + "learning_rate": 8.443964214868387e-05, + "loss": 0.6748929619789124, + "step": 4516 + }, + { + "epoch": 1.9063291139240506, + "grad_norm": 1.1661686897277832, + "learning_rate": 8.442222858056193e-05, + "loss": 0.6492021083831787, + "step": 4518 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 1.241477370262146, + "learning_rate": 8.440480707189295e-05, + "loss": 0.635409951210022, + "step": 4520 + }, + { + "epoch": 1.908016877637131, + "grad_norm": 1.1102054119110107, + "learning_rate": 8.438737762669573e-05, + "loss": 0.631928026676178, + "step": 4522 + }, + { + "epoch": 1.9088607594936708, + "grad_norm": 1.0638107061386108, + "learning_rate": 8.43699402489909e-05, + "loss": 0.604518473148346, + "step": 4524 + }, + { + "epoch": 1.909704641350211, + "grad_norm": 1.0270655155181885, + "learning_rate": 8.435249494280096e-05, + "loss": 0.61314457654953, + "step": 4526 + }, + { + "epoch": 1.9105485232067512, + "grad_norm": 1.1840111017227173, + "learning_rate": 8.433504171215018e-05, + "loss": 0.661663293838501, + "step": 4528 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 1.1404399871826172, + "learning_rate": 8.43175805610647e-05, + "loss": 0.7026967406272888, + "step": 4530 + }, + { + "epoch": 1.9122362869198313, + "grad_norm": 1.2371265888214111, + "learning_rate": 8.430011149357246e-05, + "loss": 0.6599440574645996, + "step": 4532 + }, + { + "epoch": 1.9130801687763714, + "grad_norm": 1.0042651891708374, + "learning_rate": 8.428263451370326e-05, + "loss": 0.5728344321250916, + "step": 4534 + }, + { + "epoch": 1.9139240506329114, + "grad_norm": 1.04367196559906, + "learning_rate": 8.426514962548866e-05, + "loss": 0.6495450735092163, + "step": 4536 + }, + { + "epoch": 1.9147679324894513, + "grad_norm": 1.0867135524749756, + "learning_rate": 8.424765683296215e-05, + "loss": 0.6406553387641907, + "step": 4538 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 1.0751310586929321, + "learning_rate": 8.423015614015892e-05, + "loss": 0.6692186594009399, + "step": 4540 + }, + { + "epoch": 1.9164556962025316, + "grad_norm": 1.13556969165802, + "learning_rate": 8.421264755111607e-05, + "loss": 0.6029785871505737, + "step": 4542 + }, + { + "epoch": 1.9172995780590716, + "grad_norm": 1.1560977697372437, + "learning_rate": 8.419513106987251e-05, + "loss": 0.6457844972610474, + "step": 4544 + }, + { + "epoch": 1.918143459915612, + "grad_norm": 1.2192902565002441, + "learning_rate": 8.417760670046893e-05, + "loss": 0.7082147598266602, + "step": 4546 + }, + { + "epoch": 1.918987341772152, + "grad_norm": 1.1170696020126343, + "learning_rate": 8.41600744469479e-05, + "loss": 0.6919234991073608, + "step": 4548 + }, + { + "epoch": 1.9198312236286919, + "grad_norm": 1.061253547668457, + "learning_rate": 8.414253431335373e-05, + "loss": 0.6310052871704102, + "step": 4550 + }, + { + "epoch": 1.920675105485232, + "grad_norm": 1.0671885013580322, + "learning_rate": 8.412498630373263e-05, + "loss": 0.6330236792564392, + "step": 4552 + }, + { + "epoch": 1.9215189873417722, + "grad_norm": 1.2085163593292236, + "learning_rate": 8.410743042213256e-05, + "loss": 0.7031015157699585, + "step": 4554 + }, + { + "epoch": 1.9223628691983121, + "grad_norm": 1.2682013511657715, + "learning_rate": 8.408986667260334e-05, + "loss": 0.7078304290771484, + "step": 4556 + }, + { + "epoch": 1.9232067510548523, + "grad_norm": 1.2966876029968262, + "learning_rate": 8.407229505919658e-05, + "loss": 0.6542860865592957, + "step": 4558 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 1.1086169481277466, + "learning_rate": 8.405471558596573e-05, + "loss": 0.5856828093528748, + "step": 4560 + }, + { + "epoch": 1.9248945147679324, + "grad_norm": 1.3175504207611084, + "learning_rate": 8.403712825696604e-05, + "loss": 0.7382104992866516, + "step": 4562 + }, + { + "epoch": 1.9257383966244725, + "grad_norm": 1.163164496421814, + "learning_rate": 8.401953307625454e-05, + "loss": 0.6862360239028931, + "step": 4564 + }, + { + "epoch": 1.9265822784810127, + "grad_norm": 1.207650899887085, + "learning_rate": 8.400193004789013e-05, + "loss": 0.7442302703857422, + "step": 4566 + }, + { + "epoch": 1.9274261603375527, + "grad_norm": 1.1570589542388916, + "learning_rate": 8.398431917593345e-05, + "loss": 0.595226526260376, + "step": 4568 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 1.091927170753479, + "learning_rate": 8.396670046444704e-05, + "loss": 0.6360410451889038, + "step": 4570 + }, + { + "epoch": 1.929113924050633, + "grad_norm": 1.149559497833252, + "learning_rate": 8.394907391749516e-05, + "loss": 0.6343122124671936, + "step": 4572 + }, + { + "epoch": 1.929957805907173, + "grad_norm": 1.0585254430770874, + "learning_rate": 8.393143953914395e-05, + "loss": 0.7394745349884033, + "step": 4574 + }, + { + "epoch": 1.930801687763713, + "grad_norm": 1.1648521423339844, + "learning_rate": 8.391379733346128e-05, + "loss": 0.6489678025245667, + "step": 4576 + }, + { + "epoch": 1.9316455696202532, + "grad_norm": 1.1756316423416138, + "learning_rate": 8.389614730451692e-05, + "loss": 0.6687861084938049, + "step": 4578 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.9857237339019775, + "learning_rate": 8.387848945638235e-05, + "loss": 0.523727536201477, + "step": 4580 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.1038693189620972, + "learning_rate": 8.386082379313092e-05, + "loss": 0.6545047760009766, + "step": 4582 + }, + { + "epoch": 1.9341772151898735, + "grad_norm": 1.0780832767486572, + "learning_rate": 8.384315031883774e-05, + "loss": 0.6067036390304565, + "step": 4584 + }, + { + "epoch": 1.9350210970464135, + "grad_norm": 1.2915070056915283, + "learning_rate": 8.382546903757975e-05, + "loss": 0.6880824565887451, + "step": 4586 + }, + { + "epoch": 1.9358649789029536, + "grad_norm": 1.1243441104888916, + "learning_rate": 8.380777995343568e-05, + "loss": 0.7319117188453674, + "step": 4588 + }, + { + "epoch": 1.9367088607594938, + "grad_norm": 1.1143072843551636, + "learning_rate": 8.379008307048609e-05, + "loss": 0.6845395565032959, + "step": 4590 + }, + { + "epoch": 1.9375527426160337, + "grad_norm": 1.039494276046753, + "learning_rate": 8.377237839281327e-05, + "loss": 0.6653600335121155, + "step": 4592 + }, + { + "epoch": 1.9383966244725739, + "grad_norm": 1.299617886543274, + "learning_rate": 8.375466592450136e-05, + "loss": 0.6352495551109314, + "step": 4594 + }, + { + "epoch": 1.939240506329114, + "grad_norm": 0.9918657541275024, + "learning_rate": 8.373694566963631e-05, + "loss": 0.5660957098007202, + "step": 4596 + }, + { + "epoch": 1.940084388185654, + "grad_norm": 1.0540478229522705, + "learning_rate": 8.371921763230579e-05, + "loss": 0.6296496987342834, + "step": 4598 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 1.1309545040130615, + "learning_rate": 8.370148181659939e-05, + "loss": 0.6672025918960571, + "step": 4600 + }, + { + "epoch": 1.9409282700421941, + "eval_loss": 0.6930755376815796, + "eval_runtime": 617.8927, + "eval_samples_per_second": 3.41, + "eval_steps_per_second": 3.41, + "step": 4600 + }, + { + "epoch": 1.9417721518987343, + "grad_norm": 1.2338588237762451, + "learning_rate": 8.368373822660836e-05, + "loss": 0.6200884580612183, + "step": 4602 + }, + { + "epoch": 1.9426160337552743, + "grad_norm": 1.1756945848464966, + "learning_rate": 8.366598686642582e-05, + "loss": 0.653294026851654, + "step": 4604 + }, + { + "epoch": 1.9434599156118142, + "grad_norm": 1.032018780708313, + "learning_rate": 8.364822774014671e-05, + "loss": 0.5670395493507385, + "step": 4606 + }, + { + "epoch": 1.9443037974683546, + "grad_norm": 1.045280933380127, + "learning_rate": 8.363046085186766e-05, + "loss": 0.6819197535514832, + "step": 4608 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 1.3223930597305298, + "learning_rate": 8.36126862056872e-05, + "loss": 0.6952820420265198, + "step": 4610 + }, + { + "epoch": 1.9459915611814345, + "grad_norm": 1.0048432350158691, + "learning_rate": 8.359490380570556e-05, + "loss": 0.5291440486907959, + "step": 4612 + }, + { + "epoch": 1.9468354430379748, + "grad_norm": 1.1477346420288086, + "learning_rate": 8.357711365602483e-05, + "loss": 0.6857813000679016, + "step": 4614 + }, + { + "epoch": 1.9476793248945148, + "grad_norm": 0.959985077381134, + "learning_rate": 8.355931576074882e-05, + "loss": 0.5581508278846741, + "step": 4616 + }, + { + "epoch": 1.9485232067510547, + "grad_norm": 1.1104289293289185, + "learning_rate": 8.35415101239832e-05, + "loss": 0.6536211371421814, + "step": 4618 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 1.2344517707824707, + "learning_rate": 8.352369674983535e-05, + "loss": 0.6570560336112976, + "step": 4620 + }, + { + "epoch": 1.950210970464135, + "grad_norm": 1.3411606550216675, + "learning_rate": 8.350587564241451e-05, + "loss": 0.6070495247840881, + "step": 4622 + }, + { + "epoch": 1.951054852320675, + "grad_norm": 1.1713159084320068, + "learning_rate": 8.348804680583166e-05, + "loss": 0.6444135904312134, + "step": 4624 + }, + { + "epoch": 1.9518987341772152, + "grad_norm": 1.127242922782898, + "learning_rate": 8.347021024419954e-05, + "loss": 0.6517419815063477, + "step": 4626 + }, + { + "epoch": 1.9527426160337553, + "grad_norm": 1.0733028650283813, + "learning_rate": 8.345236596163274e-05, + "loss": 0.6174065470695496, + "step": 4628 + }, + { + "epoch": 1.9535864978902953, + "grad_norm": 1.1114680767059326, + "learning_rate": 8.343451396224757e-05, + "loss": 0.7163593769073486, + "step": 4630 + }, + { + "epoch": 1.9544303797468354, + "grad_norm": 1.0839568376541138, + "learning_rate": 8.341665425016216e-05, + "loss": 0.698553204536438, + "step": 4632 + }, + { + "epoch": 1.9552742616033756, + "grad_norm": 1.17001211643219, + "learning_rate": 8.339878682949638e-05, + "loss": 0.6224857568740845, + "step": 4634 + }, + { + "epoch": 1.9561181434599155, + "grad_norm": 3.483793020248413, + "learning_rate": 8.338091170437193e-05, + "loss": 0.5931200981140137, + "step": 4636 + }, + { + "epoch": 1.9569620253164557, + "grad_norm": 1.1575394868850708, + "learning_rate": 8.336302887891224e-05, + "loss": 0.6031442284584045, + "step": 4638 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 1.1494992971420288, + "learning_rate": 8.334513835724252e-05, + "loss": 0.6101768016815186, + "step": 4640 + }, + { + "epoch": 1.9586497890295358, + "grad_norm": 1.3858197927474976, + "learning_rate": 8.332724014348981e-05, + "loss": 0.6571711301803589, + "step": 4642 + }, + { + "epoch": 1.959493670886076, + "grad_norm": 1.1094943284988403, + "learning_rate": 8.330933424178284e-05, + "loss": 0.6391071677207947, + "step": 4644 + }, + { + "epoch": 1.9603375527426161, + "grad_norm": 1.1640198230743408, + "learning_rate": 8.329142065625218e-05, + "loss": 0.6542805433273315, + "step": 4646 + }, + { + "epoch": 1.961181434599156, + "grad_norm": 1.1080211400985718, + "learning_rate": 8.327349939103016e-05, + "loss": 0.6053075194358826, + "step": 4648 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.0137052536010742, + "learning_rate": 8.325557045025085e-05, + "loss": 0.6009573340415955, + "step": 4650 + }, + { + "epoch": 1.9628691983122364, + "grad_norm": 1.0867283344268799, + "learning_rate": 8.323763383805012e-05, + "loss": 0.5993483066558838, + "step": 4652 + }, + { + "epoch": 1.9637130801687763, + "grad_norm": 1.0577161312103271, + "learning_rate": 8.321968955856562e-05, + "loss": 0.6788463592529297, + "step": 4654 + }, + { + "epoch": 1.9645569620253165, + "grad_norm": 1.2002183198928833, + "learning_rate": 8.320173761593672e-05, + "loss": 0.5786917209625244, + "step": 4656 + }, + { + "epoch": 1.9654008438818567, + "grad_norm": 1.2266993522644043, + "learning_rate": 8.318377801430461e-05, + "loss": 0.7437994480133057, + "step": 4658 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 1.007582187652588, + "learning_rate": 8.316581075781223e-05, + "loss": 0.6763550639152527, + "step": 4660 + }, + { + "epoch": 1.9670886075949368, + "grad_norm": 1.2374811172485352, + "learning_rate": 8.314783585060425e-05, + "loss": 0.6953140497207642, + "step": 4662 + }, + { + "epoch": 1.967932489451477, + "grad_norm": 1.1791057586669922, + "learning_rate": 8.312985329682717e-05, + "loss": 0.6867341995239258, + "step": 4664 + }, + { + "epoch": 1.9687763713080169, + "grad_norm": 1.1903331279754639, + "learning_rate": 8.31118631006292e-05, + "loss": 0.6445001363754272, + "step": 4666 + }, + { + "epoch": 1.9696202531645568, + "grad_norm": 1.1731067895889282, + "learning_rate": 8.309386526616034e-05, + "loss": 0.6500589847564697, + "step": 4668 + }, + { + "epoch": 1.9704641350210972, + "grad_norm": 0.9470233917236328, + "learning_rate": 8.307585979757233e-05, + "loss": 0.6215718984603882, + "step": 4670 + }, + { + "epoch": 1.9713080168776371, + "grad_norm": 1.2900800704956055, + "learning_rate": 8.305784669901872e-05, + "loss": 0.6396787762641907, + "step": 4672 + }, + { + "epoch": 1.972151898734177, + "grad_norm": 1.1729133129119873, + "learning_rate": 8.303982597465474e-05, + "loss": 0.6581959128379822, + "step": 4674 + }, + { + "epoch": 1.9729957805907175, + "grad_norm": 1.1450555324554443, + "learning_rate": 8.302179762863746e-05, + "loss": 0.7013490796089172, + "step": 4676 + }, + { + "epoch": 1.9738396624472574, + "grad_norm": 1.1506338119506836, + "learning_rate": 8.300376166512567e-05, + "loss": 0.6796102523803711, + "step": 4678 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 1.149979591369629, + "learning_rate": 8.298571808827991e-05, + "loss": 0.6960519552230835, + "step": 4680 + }, + { + "epoch": 1.9755274261603377, + "grad_norm": 1.1078912019729614, + "learning_rate": 8.296766690226249e-05, + "loss": 0.6789507865905762, + "step": 4682 + }, + { + "epoch": 1.9763713080168777, + "grad_norm": 1.0199202299118042, + "learning_rate": 8.294960811123747e-05, + "loss": 0.5962659120559692, + "step": 4684 + }, + { + "epoch": 1.9772151898734176, + "grad_norm": 1.2226134538650513, + "learning_rate": 8.293154171937068e-05, + "loss": 0.6483094692230225, + "step": 4686 + }, + { + "epoch": 1.9780590717299578, + "grad_norm": 1.184095025062561, + "learning_rate": 8.291346773082965e-05, + "loss": 0.6750242710113525, + "step": 4688 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 1.1018693447113037, + "learning_rate": 8.289538614978375e-05, + "loss": 0.7094066739082336, + "step": 4690 + }, + { + "epoch": 1.9797468354430379, + "grad_norm": 1.0342390537261963, + "learning_rate": 8.287729698040403e-05, + "loss": 0.6554126739501953, + "step": 4692 + }, + { + "epoch": 1.980590717299578, + "grad_norm": 1.0603563785552979, + "learning_rate": 8.285920022686332e-05, + "loss": 0.5493529438972473, + "step": 4694 + }, + { + "epoch": 1.9814345991561182, + "grad_norm": 1.139609932899475, + "learning_rate": 8.284109589333617e-05, + "loss": 0.6824741363525391, + "step": 4696 + }, + { + "epoch": 1.9822784810126581, + "grad_norm": 1.2167822122573853, + "learning_rate": 8.282298398399895e-05, + "loss": 0.7121000289916992, + "step": 4698 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 1.109857201576233, + "learning_rate": 8.280486450302968e-05, + "loss": 0.6711249351501465, + "step": 4700 + }, + { + "epoch": 1.9831223628691983, + "eval_loss": 0.6923081278800964, + "eval_runtime": 514.7729, + "eval_samples_per_second": 4.093, + "eval_steps_per_second": 4.093, + "step": 4700 + }, + { + "epoch": 1.9839662447257385, + "grad_norm": 1.1387107372283936, + "learning_rate": 8.27867374546082e-05, + "loss": 0.581635594367981, + "step": 4702 + }, + { + "epoch": 1.9848101265822784, + "grad_norm": 1.2519257068634033, + "learning_rate": 8.27686028429161e-05, + "loss": 0.6867302060127258, + "step": 4704 + }, + { + "epoch": 1.9856540084388186, + "grad_norm": 1.0927205085754395, + "learning_rate": 8.275046067213663e-05, + "loss": 0.6494556665420532, + "step": 4706 + }, + { + "epoch": 1.9864978902953587, + "grad_norm": 1.042035698890686, + "learning_rate": 8.273231094645487e-05, + "loss": 0.6949493288993835, + "step": 4708 + }, + { + "epoch": 1.9873417721518987, + "grad_norm": 1.0220824480056763, + "learning_rate": 8.271415367005762e-05, + "loss": 0.6535884737968445, + "step": 4710 + }, + { + "epoch": 1.9881856540084388, + "grad_norm": 1.3023611307144165, + "learning_rate": 8.269598884713339e-05, + "loss": 0.6635278463363647, + "step": 4712 + }, + { + "epoch": 1.989029535864979, + "grad_norm": 1.2526965141296387, + "learning_rate": 8.267781648187248e-05, + "loss": 0.7194697856903076, + "step": 4714 + }, + { + "epoch": 1.989873417721519, + "grad_norm": 1.0388038158416748, + "learning_rate": 8.265963657846691e-05, + "loss": 0.6355333924293518, + "step": 4716 + }, + { + "epoch": 1.990717299578059, + "grad_norm": 1.0852965116500854, + "learning_rate": 8.264144914111041e-05, + "loss": 0.6898305416107178, + "step": 4718 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 1.0714049339294434, + "learning_rate": 8.262325417399847e-05, + "loss": 0.6202836036682129, + "step": 4720 + }, + { + "epoch": 1.9924050632911392, + "grad_norm": 1.0767238140106201, + "learning_rate": 8.260505168132835e-05, + "loss": 0.6160458326339722, + "step": 4722 + }, + { + "epoch": 1.9932489451476794, + "grad_norm": 0.9605211615562439, + "learning_rate": 8.258684166729899e-05, + "loss": 0.6049920916557312, + "step": 4724 + }, + { + "epoch": 1.9940928270042195, + "grad_norm": 1.0580185651779175, + "learning_rate": 8.256862413611113e-05, + "loss": 0.5622014999389648, + "step": 4726 + }, + { + "epoch": 1.9949367088607595, + "grad_norm": 1.1039034128189087, + "learning_rate": 8.255039909196713e-05, + "loss": 0.6678924560546875, + "step": 4728 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 1.1482586860656738, + "learning_rate": 8.253216653907123e-05, + "loss": 0.658260703086853, + "step": 4730 + }, + { + "epoch": 1.9966244725738398, + "grad_norm": 1.135349988937378, + "learning_rate": 8.251392648162929e-05, + "loss": 0.6461613178253174, + "step": 4732 + }, + { + "epoch": 1.9974683544303797, + "grad_norm": 1.0155420303344727, + "learning_rate": 8.249567892384895e-05, + "loss": 0.6837426424026489, + "step": 4734 + }, + { + "epoch": 1.9983122362869197, + "grad_norm": 1.3392970561981201, + "learning_rate": 8.247742386993958e-05, + "loss": 0.6091697812080383, + "step": 4736 + }, + { + "epoch": 1.99915611814346, + "grad_norm": 1.0509974956512451, + "learning_rate": 8.245916132411226e-05, + "loss": 0.6539653539657593, + "step": 4738 + }, + { + "epoch": 2.0, + "grad_norm": 0.9777396321296692, + "learning_rate": 8.244089129057982e-05, + "loss": 0.5630147457122803, + "step": 4740 + }, + { + "epoch": 2.00084388185654, + "grad_norm": 1.1639164686203003, + "learning_rate": 8.24226137735568e-05, + "loss": 0.6190353631973267, + "step": 4742 + }, + { + "epoch": 2.0016877637130803, + "grad_norm": 1.119614839553833, + "learning_rate": 8.240432877725947e-05, + "loss": 0.6282529234886169, + "step": 4744 + }, + { + "epoch": 2.0025316455696203, + "grad_norm": 1.114739179611206, + "learning_rate": 8.238603630590581e-05, + "loss": 0.6176725625991821, + "step": 4746 + }, + { + "epoch": 2.00337552742616, + "grad_norm": 1.0543076992034912, + "learning_rate": 8.236773636371557e-05, + "loss": 0.5182007551193237, + "step": 4748 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 1.060389518737793, + "learning_rate": 8.234942895491019e-05, + "loss": 0.532536506652832, + "step": 4750 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 1.0824412107467651, + "learning_rate": 8.233111408371282e-05, + "loss": 0.5474061369895935, + "step": 4752 + }, + { + "epoch": 2.0059071729957805, + "grad_norm": 1.1450858116149902, + "learning_rate": 8.231279175434838e-05, + "loss": 0.586384654045105, + "step": 4754 + }, + { + "epoch": 2.006751054852321, + "grad_norm": 1.1225577592849731, + "learning_rate": 8.229446197104345e-05, + "loss": 0.6469444036483765, + "step": 4756 + }, + { + "epoch": 2.007594936708861, + "grad_norm": 1.7292449474334717, + "learning_rate": 8.227612473802637e-05, + "loss": 0.5371572971343994, + "step": 4758 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 1.1743781566619873, + "learning_rate": 8.22577800595272e-05, + "loss": 0.558707058429718, + "step": 4760 + }, + { + "epoch": 2.009282700421941, + "grad_norm": 1.0385273694992065, + "learning_rate": 8.223942793977769e-05, + "loss": 0.5943514108657837, + "step": 4762 + }, + { + "epoch": 2.010126582278481, + "grad_norm": 1.1302000284194946, + "learning_rate": 8.222106838301131e-05, + "loss": 0.5630753636360168, + "step": 4764 + }, + { + "epoch": 2.010970464135021, + "grad_norm": 1.140005111694336, + "learning_rate": 8.220270139346327e-05, + "loss": 0.527510404586792, + "step": 4766 + }, + { + "epoch": 2.0118143459915614, + "grad_norm": 1.1979734897613525, + "learning_rate": 8.21843269753705e-05, + "loss": 0.6315013766288757, + "step": 4768 + }, + { + "epoch": 2.0126582278481013, + "grad_norm": 1.3759459257125854, + "learning_rate": 8.21659451329716e-05, + "loss": 0.6225199699401855, + "step": 4770 + }, + { + "epoch": 2.0135021097046413, + "grad_norm": 1.330600380897522, + "learning_rate": 8.21475558705069e-05, + "loss": 0.6838938593864441, + "step": 4772 + }, + { + "epoch": 2.014345991561181, + "grad_norm": 1.2365351915359497, + "learning_rate": 8.21291591922185e-05, + "loss": 0.606302797794342, + "step": 4774 + }, + { + "epoch": 2.0151898734177216, + "grad_norm": 1.1886142492294312, + "learning_rate": 8.211075510235011e-05, + "loss": 0.6194182634353638, + "step": 4776 + }, + { + "epoch": 2.0160337552742615, + "grad_norm": 1.1414743661880493, + "learning_rate": 8.209234360514721e-05, + "loss": 0.639540433883667, + "step": 4778 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 1.2877455949783325, + "learning_rate": 8.2073924704857e-05, + "loss": 0.6350902318954468, + "step": 4780 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 1.095578908920288, + "learning_rate": 8.205549840572834e-05, + "loss": 0.5152000784873962, + "step": 4782 + }, + { + "epoch": 2.018565400843882, + "grad_norm": 1.0043798685073853, + "learning_rate": 8.203706471201183e-05, + "loss": 0.46245837211608887, + "step": 4784 + }, + { + "epoch": 2.0194092827004217, + "grad_norm": 1.2133857011795044, + "learning_rate": 8.201862362795979e-05, + "loss": 0.6471722722053528, + "step": 4786 + }, + { + "epoch": 2.020253164556962, + "grad_norm": 1.0835390090942383, + "learning_rate": 8.200017515782619e-05, + "loss": 0.5790625214576721, + "step": 4788 + }, + { + "epoch": 2.021097046413502, + "grad_norm": 1.0176091194152832, + "learning_rate": 8.198171930586678e-05, + "loss": 0.5826238989830017, + "step": 4790 + }, + { + "epoch": 2.021940928270042, + "grad_norm": 1.1581370830535889, + "learning_rate": 8.196325607633893e-05, + "loss": 0.5781272649765015, + "step": 4792 + }, + { + "epoch": 2.0227848101265824, + "grad_norm": 1.243381142616272, + "learning_rate": 8.194478547350178e-05, + "loss": 0.6600401997566223, + "step": 4794 + }, + { + "epoch": 2.0236286919831223, + "grad_norm": 1.0718560218811035, + "learning_rate": 8.192630750161612e-05, + "loss": 0.5291268825531006, + "step": 4796 + }, + { + "epoch": 2.0244725738396623, + "grad_norm": 1.2338320016860962, + "learning_rate": 8.190782216494448e-05, + "loss": 0.6564924120903015, + "step": 4798 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.978547990322113, + "learning_rate": 8.188932946775107e-05, + "loss": 0.5471183657646179, + "step": 4800 + }, + { + "epoch": 2.0253164556962027, + "eval_loss": 0.6924457550048828, + "eval_runtime": 514.0427, + "eval_samples_per_second": 4.099, + "eval_steps_per_second": 4.099, + "step": 4800 + }, + { + "epoch": 2.0261603375527426, + "grad_norm": 1.1782792806625366, + "learning_rate": 8.18708294143018e-05, + "loss": 0.567442774772644, + "step": 4802 + }, + { + "epoch": 2.0270042194092825, + "grad_norm": 1.0768574476242065, + "learning_rate": 8.185232200886426e-05, + "loss": 0.6005180478096008, + "step": 4804 + }, + { + "epoch": 2.027848101265823, + "grad_norm": 1.3096717596054077, + "learning_rate": 8.18338072557078e-05, + "loss": 0.616436779499054, + "step": 4806 + }, + { + "epoch": 2.028691983122363, + "grad_norm": 1.0233508348464966, + "learning_rate": 8.181528515910336e-05, + "loss": 0.49587416648864746, + "step": 4808 + }, + { + "epoch": 2.029535864978903, + "grad_norm": 1.0800065994262695, + "learning_rate": 8.179675572332366e-05, + "loss": 0.5758571624755859, + "step": 4810 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 1.09299898147583, + "learning_rate": 8.177821895264309e-05, + "loss": 0.561736524105072, + "step": 4812 + }, + { + "epoch": 2.031223628691983, + "grad_norm": 1.1439210176467896, + "learning_rate": 8.175967485133771e-05, + "loss": 0.5249468088150024, + "step": 4814 + }, + { + "epoch": 2.032067510548523, + "grad_norm": 1.15841805934906, + "learning_rate": 8.174112342368532e-05, + "loss": 0.6429001688957214, + "step": 4816 + }, + { + "epoch": 2.0329113924050635, + "grad_norm": 1.1720670461654663, + "learning_rate": 8.172256467396533e-05, + "loss": 0.60152667760849, + "step": 4818 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 1.2652091979980469, + "learning_rate": 8.170399860645892e-05, + "loss": 0.5553541779518127, + "step": 4820 + }, + { + "epoch": 2.0345991561181433, + "grad_norm": 1.0768507719039917, + "learning_rate": 8.168542522544893e-05, + "loss": 0.5369323492050171, + "step": 4822 + }, + { + "epoch": 2.0354430379746837, + "grad_norm": 0.9906469583511353, + "learning_rate": 8.166684453521986e-05, + "loss": 0.5468952655792236, + "step": 4824 + }, + { + "epoch": 2.0362869198312237, + "grad_norm": 1.3448988199234009, + "learning_rate": 8.164825654005792e-05, + "loss": 0.5795659422874451, + "step": 4826 + }, + { + "epoch": 2.0371308016877636, + "grad_norm": 1.2502341270446777, + "learning_rate": 8.162966124425103e-05, + "loss": 0.6465779542922974, + "step": 4828 + }, + { + "epoch": 2.037974683544304, + "grad_norm": 1.1512303352355957, + "learning_rate": 8.161105865208875e-05, + "loss": 0.5509394407272339, + "step": 4830 + }, + { + "epoch": 2.038818565400844, + "grad_norm": 1.2513408660888672, + "learning_rate": 8.159244876786232e-05, + "loss": 0.5515735745429993, + "step": 4832 + }, + { + "epoch": 2.039662447257384, + "grad_norm": 1.3035682439804077, + "learning_rate": 8.157383159586473e-05, + "loss": 0.757799506187439, + "step": 4834 + }, + { + "epoch": 2.0405063291139243, + "grad_norm": 1.1136540174484253, + "learning_rate": 8.155520714039056e-05, + "loss": 0.607295036315918, + "step": 4836 + }, + { + "epoch": 2.041350210970464, + "grad_norm": 1.220146656036377, + "learning_rate": 8.153657540573613e-05, + "loss": 0.5769712328910828, + "step": 4838 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 1.2104195356369019, + "learning_rate": 8.151793639619944e-05, + "loss": 0.5746933817863464, + "step": 4840 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 1.241708517074585, + "learning_rate": 8.149929011608014e-05, + "loss": 0.5932332277297974, + "step": 4842 + }, + { + "epoch": 2.0438818565400845, + "grad_norm": 1.1172713041305542, + "learning_rate": 8.148063656967955e-05, + "loss": 0.583284318447113, + "step": 4844 + }, + { + "epoch": 2.0447257383966244, + "grad_norm": 1.0867618322372437, + "learning_rate": 8.14619757613007e-05, + "loss": 0.5589476823806763, + "step": 4846 + }, + { + "epoch": 2.0455696202531644, + "grad_norm": 1.2470483779907227, + "learning_rate": 8.14433076952483e-05, + "loss": 0.6118156313896179, + "step": 4848 + }, + { + "epoch": 2.0464135021097047, + "grad_norm": 1.0908832550048828, + "learning_rate": 8.142463237582868e-05, + "loss": 0.5815895795822144, + "step": 4850 + }, + { + "epoch": 2.0472573839662447, + "grad_norm": 1.2589281797409058, + "learning_rate": 8.140594980734989e-05, + "loss": 0.6232373714447021, + "step": 4852 + }, + { + "epoch": 2.0481012658227846, + "grad_norm": 1.234152913093567, + "learning_rate": 8.138725999412165e-05, + "loss": 0.5992053151130676, + "step": 4854 + }, + { + "epoch": 2.048945147679325, + "grad_norm": 1.3304446935653687, + "learning_rate": 8.136856294045533e-05, + "loss": 0.6494496464729309, + "step": 4856 + }, + { + "epoch": 2.049789029535865, + "grad_norm": 1.1871088743209839, + "learning_rate": 8.134985865066398e-05, + "loss": 0.6263431906700134, + "step": 4858 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 1.1454699039459229, + "learning_rate": 8.133114712906234e-05, + "loss": 0.6036502122879028, + "step": 4860 + }, + { + "epoch": 2.0514767932489453, + "grad_norm": 1.2953420877456665, + "learning_rate": 8.131242837996675e-05, + "loss": 0.5674451589584351, + "step": 4862 + }, + { + "epoch": 2.052320675105485, + "grad_norm": 1.1874405145645142, + "learning_rate": 8.129370240769534e-05, + "loss": 0.5616317987442017, + "step": 4864 + }, + { + "epoch": 2.053164556962025, + "grad_norm": 1.2936227321624756, + "learning_rate": 8.127496921656777e-05, + "loss": 0.6495023369789124, + "step": 4866 + }, + { + "epoch": 2.0540084388185655, + "grad_norm": 1.1935228109359741, + "learning_rate": 8.125622881090544e-05, + "loss": 0.6028099060058594, + "step": 4868 + }, + { + "epoch": 2.0548523206751055, + "grad_norm": 0.9932331442832947, + "learning_rate": 8.123748119503143e-05, + "loss": 0.476296067237854, + "step": 4870 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 1.3878839015960693, + "learning_rate": 8.121872637327042e-05, + "loss": 0.6191902756690979, + "step": 4872 + }, + { + "epoch": 2.056540084388186, + "grad_norm": 1.1185581684112549, + "learning_rate": 8.11999643499488e-05, + "loss": 0.566487729549408, + "step": 4874 + }, + { + "epoch": 2.0573839662447257, + "grad_norm": 1.3729257583618164, + "learning_rate": 8.118119512939464e-05, + "loss": 0.5970078706741333, + "step": 4876 + }, + { + "epoch": 2.0582278481012657, + "grad_norm": 1.1332688331604004, + "learning_rate": 8.11624187159376e-05, + "loss": 0.570341944694519, + "step": 4878 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 1.2648937702178955, + "learning_rate": 8.114363511390903e-05, + "loss": 0.6302897334098816, + "step": 4880 + }, + { + "epoch": 2.059915611814346, + "grad_norm": 1.250616192817688, + "learning_rate": 8.112484432764197e-05, + "loss": 0.5619142651557922, + "step": 4882 + }, + { + "epoch": 2.060759493670886, + "grad_norm": 0.9710861444473267, + "learning_rate": 8.110604636147109e-05, + "loss": 0.5426228642463684, + "step": 4884 + }, + { + "epoch": 2.0616033755274263, + "grad_norm": 1.1979506015777588, + "learning_rate": 8.108724121973271e-05, + "loss": 0.5498107671737671, + "step": 4886 + }, + { + "epoch": 2.0624472573839663, + "grad_norm": 1.0936485528945923, + "learning_rate": 8.106842890676483e-05, + "loss": 0.5695134401321411, + "step": 4888 + }, + { + "epoch": 2.0632911392405062, + "grad_norm": 1.1246092319488525, + "learning_rate": 8.10496094269071e-05, + "loss": 0.5998331308364868, + "step": 4890 + }, + { + "epoch": 2.0641350210970466, + "grad_norm": 1.244438648223877, + "learning_rate": 8.103078278450075e-05, + "loss": 0.5702623128890991, + "step": 4892 + }, + { + "epoch": 2.0649789029535865, + "grad_norm": 1.1585633754730225, + "learning_rate": 8.101194898388881e-05, + "loss": 0.5392299890518188, + "step": 4894 + }, + { + "epoch": 2.0658227848101265, + "grad_norm": 1.3044285774230957, + "learning_rate": 8.099310802941582e-05, + "loss": 0.5640127658843994, + "step": 4896 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.2483032941818237, + "learning_rate": 8.097425992542804e-05, + "loss": 0.6103175282478333, + "step": 4898 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 1.0845462083816528, + "learning_rate": 8.095540467627337e-05, + "loss": 0.5041166543960571, + "step": 4900 + }, + { + "epoch": 2.067510548523207, + "eval_loss": 0.6941288113594055, + "eval_runtime": 513.4497, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 4900 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 1.2493232488632202, + "learning_rate": 8.093654228630134e-05, + "loss": 0.6253946423530579, + "step": 4902 + }, + { + "epoch": 2.0691983122362867, + "grad_norm": 1.1668756008148193, + "learning_rate": 8.091767275986317e-05, + "loss": 0.523486852645874, + "step": 4904 + }, + { + "epoch": 2.070042194092827, + "grad_norm": 1.1709638833999634, + "learning_rate": 8.089879610131167e-05, + "loss": 0.5569989681243896, + "step": 4906 + }, + { + "epoch": 2.070886075949367, + "grad_norm": 1.1044740676879883, + "learning_rate": 8.087991231500133e-05, + "loss": 0.642728865146637, + "step": 4908 + }, + { + "epoch": 2.071729957805907, + "grad_norm": 1.1032549142837524, + "learning_rate": 8.086102140528828e-05, + "loss": 0.5998259782791138, + "step": 4910 + }, + { + "epoch": 2.0725738396624473, + "grad_norm": 0.9980027079582214, + "learning_rate": 8.08421233765303e-05, + "loss": 0.5460172891616821, + "step": 4912 + }, + { + "epoch": 2.0734177215189873, + "grad_norm": 1.0866090059280396, + "learning_rate": 8.082321823308679e-05, + "loss": 0.5643284916877747, + "step": 4914 + }, + { + "epoch": 2.0742616033755272, + "grad_norm": 1.1942687034606934, + "learning_rate": 8.080430597931878e-05, + "loss": 0.554400622844696, + "step": 4916 + }, + { + "epoch": 2.0751054852320676, + "grad_norm": 1.0680599212646484, + "learning_rate": 8.078538661958901e-05, + "loss": 0.5955621004104614, + "step": 4918 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 1.20845627784729, + "learning_rate": 8.076646015826179e-05, + "loss": 0.5970203280448914, + "step": 4920 + }, + { + "epoch": 2.0767932489451475, + "grad_norm": 1.8368924856185913, + "learning_rate": 8.074752659970308e-05, + "loss": 0.6467664837837219, + "step": 4922 + }, + { + "epoch": 2.077637130801688, + "grad_norm": 1.3291922807693481, + "learning_rate": 8.072858594828053e-05, + "loss": 0.630719006061554, + "step": 4924 + }, + { + "epoch": 2.078481012658228, + "grad_norm": 1.1496083736419678, + "learning_rate": 8.070963820836333e-05, + "loss": 0.601140022277832, + "step": 4926 + }, + { + "epoch": 2.0793248945147678, + "grad_norm": 1.1562724113464355, + "learning_rate": 8.069068338432239e-05, + "loss": 0.6096881031990051, + "step": 4928 + }, + { + "epoch": 2.080168776371308, + "grad_norm": 1.0115300416946411, + "learning_rate": 8.067172148053021e-05, + "loss": 0.5085908770561218, + "step": 4930 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 1.2181830406188965, + "learning_rate": 8.065275250136097e-05, + "loss": 0.5268720984458923, + "step": 4932 + }, + { + "epoch": 2.081856540084388, + "grad_norm": 1.1249788999557495, + "learning_rate": 8.06337764511904e-05, + "loss": 0.6075665950775146, + "step": 4934 + }, + { + "epoch": 2.0827004219409284, + "grad_norm": 1.1143964529037476, + "learning_rate": 8.061479333439595e-05, + "loss": 0.59170001745224, + "step": 4936 + }, + { + "epoch": 2.0835443037974684, + "grad_norm": 1.4773131608963013, + "learning_rate": 8.059580315535664e-05, + "loss": 0.6689745187759399, + "step": 4938 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 1.143965244293213, + "learning_rate": 8.057680591845316e-05, + "loss": 0.5409777760505676, + "step": 4940 + }, + { + "epoch": 2.0852320675105487, + "grad_norm": 1.0384942293167114, + "learning_rate": 8.055780162806777e-05, + "loss": 0.5778636336326599, + "step": 4942 + }, + { + "epoch": 2.0860759493670886, + "grad_norm": 1.0102177858352661, + "learning_rate": 8.053879028858442e-05, + "loss": 0.5576038360595703, + "step": 4944 + }, + { + "epoch": 2.0869198312236286, + "grad_norm": 1.3792158365249634, + "learning_rate": 8.051977190438868e-05, + "loss": 0.5873376131057739, + "step": 4946 + }, + { + "epoch": 2.087763713080169, + "grad_norm": 1.4402949810028076, + "learning_rate": 8.050074647986768e-05, + "loss": 0.6067743301391602, + "step": 4948 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.2719058990478516, + "learning_rate": 8.048171401941027e-05, + "loss": 0.604671835899353, + "step": 4950 + }, + { + "epoch": 2.089451476793249, + "grad_norm": 1.1054867506027222, + "learning_rate": 8.046267452740683e-05, + "loss": 0.5743544697761536, + "step": 4952 + }, + { + "epoch": 2.090295358649789, + "grad_norm": 1.0521535873413086, + "learning_rate": 8.044362800824944e-05, + "loss": 0.576278567314148, + "step": 4954 + }, + { + "epoch": 2.091139240506329, + "grad_norm": 1.2665088176727295, + "learning_rate": 8.042457446633174e-05, + "loss": 0.5903641581535339, + "step": 4956 + }, + { + "epoch": 2.091983122362869, + "grad_norm": 1.1283398866653442, + "learning_rate": 8.040551390604902e-05, + "loss": 0.5854214429855347, + "step": 4958 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 1.1194316148757935, + "learning_rate": 8.03864463317982e-05, + "loss": 0.5843619108200073, + "step": 4960 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 1.3581651449203491, + "learning_rate": 8.036737174797778e-05, + "loss": 0.6115096211433411, + "step": 4962 + }, + { + "epoch": 2.0945147679324894, + "grad_norm": 1.341748595237732, + "learning_rate": 8.034829015898793e-05, + "loss": 0.5998795032501221, + "step": 4964 + }, + { + "epoch": 2.0953586497890297, + "grad_norm": 1.2212611436843872, + "learning_rate": 8.032920156923038e-05, + "loss": 0.628372311592102, + "step": 4966 + }, + { + "epoch": 2.0962025316455697, + "grad_norm": 1.1348317861557007, + "learning_rate": 8.031010598310851e-05, + "loss": 0.5668916702270508, + "step": 4968 + }, + { + "epoch": 2.0970464135021096, + "grad_norm": 1.1106547117233276, + "learning_rate": 8.029100340502731e-05, + "loss": 0.5253881216049194, + "step": 4970 + }, + { + "epoch": 2.09789029535865, + "grad_norm": 1.2471354007720947, + "learning_rate": 8.027189383939339e-05, + "loss": 0.5790762901306152, + "step": 4972 + }, + { + "epoch": 2.09873417721519, + "grad_norm": 1.2477394342422485, + "learning_rate": 8.025277729061492e-05, + "loss": 0.6382888555526733, + "step": 4974 + }, + { + "epoch": 2.09957805907173, + "grad_norm": 1.2716054916381836, + "learning_rate": 8.023365376310176e-05, + "loss": 0.5962072610855103, + "step": 4976 + }, + { + "epoch": 2.10042194092827, + "grad_norm": 1.257820725440979, + "learning_rate": 8.021452326126532e-05, + "loss": 0.5882940292358398, + "step": 4978 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 1.0924186706542969, + "learning_rate": 8.019538578951864e-05, + "loss": 0.5640701055526733, + "step": 4980 + }, + { + "epoch": 2.10210970464135, + "grad_norm": 1.1250383853912354, + "learning_rate": 8.017624135227637e-05, + "loss": 0.5746428966522217, + "step": 4982 + }, + { + "epoch": 2.10295358649789, + "grad_norm": 1.131323218345642, + "learning_rate": 8.015708995395477e-05, + "loss": 0.5611346960067749, + "step": 4984 + }, + { + "epoch": 2.1037974683544305, + "grad_norm": 1.4267152547836304, + "learning_rate": 8.013793159897171e-05, + "loss": 0.6173797249794006, + "step": 4986 + }, + { + "epoch": 2.1046413502109704, + "grad_norm": 1.41414213180542, + "learning_rate": 8.011876629174662e-05, + "loss": 0.64865642786026, + "step": 4988 + }, + { + "epoch": 2.1054852320675104, + "grad_norm": 1.1498184204101562, + "learning_rate": 8.00995940367006e-05, + "loss": 0.6125827431678772, + "step": 4990 + }, + { + "epoch": 2.1063291139240508, + "grad_norm": 1.2327708005905151, + "learning_rate": 8.00804148382563e-05, + "loss": 0.670495867729187, + "step": 4992 + }, + { + "epoch": 2.1071729957805907, + "grad_norm": 1.2797311544418335, + "learning_rate": 8.0061228700838e-05, + "loss": 0.6020209193229675, + "step": 4994 + }, + { + "epoch": 2.1080168776371306, + "grad_norm": 1.079584002494812, + "learning_rate": 8.004203562887157e-05, + "loss": 0.5974310636520386, + "step": 4996 + }, + { + "epoch": 2.108860759493671, + "grad_norm": 1.4352604150772095, + "learning_rate": 8.002283562678452e-05, + "loss": 0.6424587368965149, + "step": 4998 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 1.0876719951629639, + "learning_rate": 8.000362869900586e-05, + "loss": 0.6185846328735352, + "step": 5000 + }, + { + "epoch": 2.109704641350211, + "eval_loss": 0.6908889412879944, + "eval_runtime": 675.8398, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "step": 5000 + }, + { + "epoch": 2.110548523206751, + "grad_norm": 1.0125762224197388, + "learning_rate": 7.998441484996631e-05, + "loss": 0.6127280592918396, + "step": 5002 + }, + { + "epoch": 2.1113924050632913, + "grad_norm": 1.0253753662109375, + "learning_rate": 7.99651940840981e-05, + "loss": 0.5495694875717163, + "step": 5004 + }, + { + "epoch": 2.1122362869198312, + "grad_norm": 1.5620673894882202, + "learning_rate": 7.994596640583511e-05, + "loss": 0.6199497580528259, + "step": 5006 + }, + { + "epoch": 2.113080168776371, + "grad_norm": 1.3032969236373901, + "learning_rate": 7.992673181961281e-05, + "loss": 0.5896390676498413, + "step": 5008 + }, + { + "epoch": 2.1139240506329116, + "grad_norm": 1.0933046340942383, + "learning_rate": 7.990749032986821e-05, + "loss": 0.6332341432571411, + "step": 5010 + }, + { + "epoch": 2.1147679324894515, + "grad_norm": 1.3115314245224, + "learning_rate": 7.988824194104e-05, + "loss": 0.5964323282241821, + "step": 5012 + }, + { + "epoch": 2.1156118143459914, + "grad_norm": 1.229978084564209, + "learning_rate": 7.986898665756837e-05, + "loss": 0.5938325524330139, + "step": 5014 + }, + { + "epoch": 2.116455696202532, + "grad_norm": 1.1779940128326416, + "learning_rate": 7.984972448389517e-05, + "loss": 0.5761791467666626, + "step": 5016 + }, + { + "epoch": 2.1172995780590718, + "grad_norm": 1.063490629196167, + "learning_rate": 7.98304554244638e-05, + "loss": 0.6073653101921082, + "step": 5018 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 1.2390391826629639, + "learning_rate": 7.981117948371927e-05, + "loss": 0.6126761436462402, + "step": 5020 + }, + { + "epoch": 2.118987341772152, + "grad_norm": 1.1946247816085815, + "learning_rate": 7.979189666610818e-05, + "loss": 0.614434003829956, + "step": 5022 + }, + { + "epoch": 2.119831223628692, + "grad_norm": 1.1008374691009521, + "learning_rate": 7.977260697607867e-05, + "loss": 0.5947603583335876, + "step": 5024 + }, + { + "epoch": 2.120675105485232, + "grad_norm": 1.14899480342865, + "learning_rate": 7.975331041808054e-05, + "loss": 0.583965539932251, + "step": 5026 + }, + { + "epoch": 2.1215189873417724, + "grad_norm": 1.1627864837646484, + "learning_rate": 7.973400699656512e-05, + "loss": 0.615121603012085, + "step": 5028 + }, + { + "epoch": 2.1223628691983123, + "grad_norm": 1.3622617721557617, + "learning_rate": 7.971469671598532e-05, + "loss": 0.6268601417541504, + "step": 5030 + }, + { + "epoch": 2.1232067510548522, + "grad_norm": 1.1735879182815552, + "learning_rate": 7.96953795807957e-05, + "loss": 0.6021270155906677, + "step": 5032 + }, + { + "epoch": 2.124050632911392, + "grad_norm": 1.3856201171875, + "learning_rate": 7.96760555954523e-05, + "loss": 0.636816680431366, + "step": 5034 + }, + { + "epoch": 2.1248945147679326, + "grad_norm": 1.1410126686096191, + "learning_rate": 7.965672476441282e-05, + "loss": 0.5324423313140869, + "step": 5036 + }, + { + "epoch": 2.1257383966244725, + "grad_norm": 1.446070909500122, + "learning_rate": 7.963738709213651e-05, + "loss": 0.7433624267578125, + "step": 5038 + }, + { + "epoch": 2.1265822784810124, + "grad_norm": 1.3041753768920898, + "learning_rate": 7.961804258308419e-05, + "loss": 0.6359145641326904, + "step": 5040 + }, + { + "epoch": 2.127426160337553, + "grad_norm": 1.2043813467025757, + "learning_rate": 7.959869124171826e-05, + "loss": 0.6164234280586243, + "step": 5042 + }, + { + "epoch": 2.1282700421940928, + "grad_norm": 1.2375630140304565, + "learning_rate": 7.957933307250273e-05, + "loss": 0.6437279582023621, + "step": 5044 + }, + { + "epoch": 2.1291139240506327, + "grad_norm": 1.210644245147705, + "learning_rate": 7.955996807990314e-05, + "loss": 0.585924506187439, + "step": 5046 + }, + { + "epoch": 2.129957805907173, + "grad_norm": 1.2011489868164062, + "learning_rate": 7.954059626838661e-05, + "loss": 0.6081803441047668, + "step": 5048 + }, + { + "epoch": 2.130801687763713, + "grad_norm": 1.0365782976150513, + "learning_rate": 7.952121764242187e-05, + "loss": 0.5609047412872314, + "step": 5050 + }, + { + "epoch": 2.131645569620253, + "grad_norm": 1.7950767278671265, + "learning_rate": 7.950183220647918e-05, + "loss": 0.5612874031066895, + "step": 5052 + }, + { + "epoch": 2.1324894514767934, + "grad_norm": 1.2933409214019775, + "learning_rate": 7.94824399650304e-05, + "loss": 0.6554630994796753, + "step": 5054 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 1.129828929901123, + "learning_rate": 7.946304092254894e-05, + "loss": 0.5623239278793335, + "step": 5056 + }, + { + "epoch": 2.1341772151898732, + "grad_norm": 1.1060296297073364, + "learning_rate": 7.944363508350978e-05, + "loss": 0.5036910772323608, + "step": 5058 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 1.2622627019882202, + "learning_rate": 7.94242224523895e-05, + "loss": 0.5840913653373718, + "step": 5060 + }, + { + "epoch": 2.1358649789029536, + "grad_norm": 1.3803153038024902, + "learning_rate": 7.940480303366618e-05, + "loss": 0.6365578770637512, + "step": 5062 + }, + { + "epoch": 2.1367088607594935, + "grad_norm": 1.2524651288986206, + "learning_rate": 7.938537683181955e-05, + "loss": 0.6167916655540466, + "step": 5064 + }, + { + "epoch": 2.137552742616034, + "grad_norm": 1.3320350646972656, + "learning_rate": 7.936594385133083e-05, + "loss": 0.6356930732727051, + "step": 5066 + }, + { + "epoch": 2.138396624472574, + "grad_norm": 1.3180949687957764, + "learning_rate": 7.934650409668285e-05, + "loss": 0.5888242721557617, + "step": 5068 + }, + { + "epoch": 2.1392405063291138, + "grad_norm": 1.1376243829727173, + "learning_rate": 7.932705757235999e-05, + "loss": 0.608725905418396, + "step": 5070 + }, + { + "epoch": 2.140084388185654, + "grad_norm": 1.1734369993209839, + "learning_rate": 7.930760428284817e-05, + "loss": 0.5824158787727356, + "step": 5072 + }, + { + "epoch": 2.140928270042194, + "grad_norm": 1.1038579940795898, + "learning_rate": 7.928814423263493e-05, + "loss": 0.5629416704177856, + "step": 5074 + }, + { + "epoch": 2.141772151898734, + "grad_norm": 1.269780158996582, + "learning_rate": 7.926867742620929e-05, + "loss": 0.5994445085525513, + "step": 5076 + }, + { + "epoch": 2.1426160337552744, + "grad_norm": 1.2274279594421387, + "learning_rate": 7.924920386806188e-05, + "loss": 0.5845475792884827, + "step": 5078 + }, + { + "epoch": 2.1434599156118144, + "grad_norm": 1.168766975402832, + "learning_rate": 7.922972356268488e-05, + "loss": 0.621201753616333, + "step": 5080 + }, + { + "epoch": 2.1443037974683543, + "grad_norm": 1.0057638883590698, + "learning_rate": 7.921023651457203e-05, + "loss": 0.5282597541809082, + "step": 5082 + }, + { + "epoch": 2.1451476793248947, + "grad_norm": 1.432309865951538, + "learning_rate": 7.91907427282186e-05, + "loss": 0.632583737373352, + "step": 5084 + }, + { + "epoch": 2.1459915611814346, + "grad_norm": 1.3939776420593262, + "learning_rate": 7.917124220812144e-05, + "loss": 0.6239289045333862, + "step": 5086 + }, + { + "epoch": 2.1468354430379746, + "grad_norm": 1.3741775751113892, + "learning_rate": 7.915173495877895e-05, + "loss": 0.5749062895774841, + "step": 5088 + }, + { + "epoch": 2.147679324894515, + "grad_norm": 1.3123528957366943, + "learning_rate": 7.913222098469109e-05, + "loss": 0.6011738181114197, + "step": 5090 + }, + { + "epoch": 2.148523206751055, + "grad_norm": 1.3473498821258545, + "learning_rate": 7.911270029035932e-05, + "loss": 0.5804699659347534, + "step": 5092 + }, + { + "epoch": 2.149367088607595, + "grad_norm": 1.0873067378997803, + "learning_rate": 7.909317288028673e-05, + "loss": 0.6446103453636169, + "step": 5094 + }, + { + "epoch": 2.1502109704641352, + "grad_norm": 1.1374083757400513, + "learning_rate": 7.907363875897789e-05, + "loss": 0.6136524677276611, + "step": 5096 + }, + { + "epoch": 2.151054852320675, + "grad_norm": 1.1356533765792847, + "learning_rate": 7.905409793093896e-05, + "loss": 0.5107976794242859, + "step": 5098 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 1.2579567432403564, + "learning_rate": 7.903455040067763e-05, + "loss": 0.6073099374771118, + "step": 5100 + }, + { + "epoch": 2.151898734177215, + "eval_loss": 0.6902023553848267, + "eval_runtime": 733.915, + "eval_samples_per_second": 2.871, + "eval_steps_per_second": 2.871, + "step": 5100 + }, + { + "epoch": 2.1527426160337555, + "grad_norm": 1.2401398420333862, + "learning_rate": 7.901499617270315e-05, + "loss": 0.5562406182289124, + "step": 5102 + }, + { + "epoch": 2.1535864978902954, + "grad_norm": 1.086590051651001, + "learning_rate": 7.899543525152628e-05, + "loss": 0.5749467015266418, + "step": 5104 + }, + { + "epoch": 2.1544303797468354, + "grad_norm": 1.206458568572998, + "learning_rate": 7.897586764165939e-05, + "loss": 0.6326877474784851, + "step": 5106 + }, + { + "epoch": 2.1552742616033758, + "grad_norm": 1.030740737915039, + "learning_rate": 7.895629334761632e-05, + "loss": 0.5616445541381836, + "step": 5108 + }, + { + "epoch": 2.1561181434599157, + "grad_norm": 1.3338581323623657, + "learning_rate": 7.89367123739125e-05, + "loss": 0.6307384371757507, + "step": 5110 + }, + { + "epoch": 2.1569620253164556, + "grad_norm": 1.2684671878814697, + "learning_rate": 7.891712472506485e-05, + "loss": 0.6087653636932373, + "step": 5112 + }, + { + "epoch": 2.1578059071729956, + "grad_norm": 1.1610581874847412, + "learning_rate": 7.889753040559188e-05, + "loss": 0.5747998952865601, + "step": 5114 + }, + { + "epoch": 2.158649789029536, + "grad_norm": 1.4069275856018066, + "learning_rate": 7.887792942001366e-05, + "loss": 0.6143770217895508, + "step": 5116 + }, + { + "epoch": 2.159493670886076, + "grad_norm": 1.0858227014541626, + "learning_rate": 7.885832177285173e-05, + "loss": 0.552534282207489, + "step": 5118 + }, + { + "epoch": 2.160337552742616, + "grad_norm": 1.067070722579956, + "learning_rate": 7.88387074686292e-05, + "loss": 0.5781989693641663, + "step": 5120 + }, + { + "epoch": 2.1611814345991562, + "grad_norm": 1.139981746673584, + "learning_rate": 7.881908651187072e-05, + "loss": 0.5521422624588013, + "step": 5122 + }, + { + "epoch": 2.162025316455696, + "grad_norm": 1.0987457036972046, + "learning_rate": 7.879945890710245e-05, + "loss": 0.5755025744438171, + "step": 5124 + }, + { + "epoch": 2.162869198312236, + "grad_norm": 1.1530758142471313, + "learning_rate": 7.877982465885214e-05, + "loss": 0.5783509612083435, + "step": 5126 + }, + { + "epoch": 2.1637130801687765, + "grad_norm": 1.2285696268081665, + "learning_rate": 7.876018377164899e-05, + "loss": 0.5942281484603882, + "step": 5128 + }, + { + "epoch": 2.1645569620253164, + "grad_norm": 1.1283711194992065, + "learning_rate": 7.874053625002378e-05, + "loss": 0.5539707541465759, + "step": 5130 + }, + { + "epoch": 2.1654008438818564, + "grad_norm": 1.3213335275650024, + "learning_rate": 7.872088209850885e-05, + "loss": 0.5955292582511902, + "step": 5132 + }, + { + "epoch": 2.1662447257383968, + "grad_norm": 1.1748592853546143, + "learning_rate": 7.8701221321638e-05, + "loss": 0.5422899723052979, + "step": 5134 + }, + { + "epoch": 2.1670886075949367, + "grad_norm": 1.0752148628234863, + "learning_rate": 7.868155392394662e-05, + "loss": 0.5547205209732056, + "step": 5136 + }, + { + "epoch": 2.1679324894514767, + "grad_norm": 1.1814554929733276, + "learning_rate": 7.86618799099716e-05, + "loss": 0.5938948392868042, + "step": 5138 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 1.3455278873443604, + "learning_rate": 7.864219928425132e-05, + "loss": 0.6468925476074219, + "step": 5140 + }, + { + "epoch": 2.169620253164557, + "grad_norm": 1.2695354223251343, + "learning_rate": 7.862251205132576e-05, + "loss": 0.5704391002655029, + "step": 5142 + }, + { + "epoch": 2.170464135021097, + "grad_norm": 1.1529468297958374, + "learning_rate": 7.860281821573638e-05, + "loss": 0.6057283878326416, + "step": 5144 + }, + { + "epoch": 2.1713080168776373, + "grad_norm": 1.3461004495620728, + "learning_rate": 7.858311778202616e-05, + "loss": 0.6135527491569519, + "step": 5146 + }, + { + "epoch": 2.1721518987341772, + "grad_norm": 1.1258536577224731, + "learning_rate": 7.856341075473962e-05, + "loss": 0.5585638880729675, + "step": 5148 + }, + { + "epoch": 2.172995780590717, + "grad_norm": 1.254898190498352, + "learning_rate": 7.854369713842279e-05, + "loss": 0.5780918002128601, + "step": 5150 + }, + { + "epoch": 2.1738396624472576, + "grad_norm": 1.2730201482772827, + "learning_rate": 7.852397693762321e-05, + "loss": 0.595267117023468, + "step": 5152 + }, + { + "epoch": 2.1746835443037975, + "grad_norm": 1.1875078678131104, + "learning_rate": 7.850425015688999e-05, + "loss": 0.5636162161827087, + "step": 5154 + }, + { + "epoch": 2.1755274261603375, + "grad_norm": 1.0930945873260498, + "learning_rate": 7.848451680077366e-05, + "loss": 0.6362089514732361, + "step": 5156 + }, + { + "epoch": 2.176371308016878, + "grad_norm": 1.2274452447891235, + "learning_rate": 7.846477687382639e-05, + "loss": 0.6268675327301025, + "step": 5158 + }, + { + "epoch": 2.1772151898734178, + "grad_norm": 1.2023133039474487, + "learning_rate": 7.844503038060176e-05, + "loss": 0.6014906167984009, + "step": 5160 + }, + { + "epoch": 2.1780590717299577, + "grad_norm": 1.2616889476776123, + "learning_rate": 7.842527732565491e-05, + "loss": 0.6180019974708557, + "step": 5162 + }, + { + "epoch": 2.1789029535864977, + "grad_norm": 1.1046907901763916, + "learning_rate": 7.84055177135425e-05, + "loss": 0.5400100946426392, + "step": 5164 + }, + { + "epoch": 2.179746835443038, + "grad_norm": 1.1664032936096191, + "learning_rate": 7.83857515488227e-05, + "loss": 0.5713199973106384, + "step": 5166 + }, + { + "epoch": 2.180590717299578, + "grad_norm": 1.2526558637619019, + "learning_rate": 7.836597883605519e-05, + "loss": 0.5741307735443115, + "step": 5168 + }, + { + "epoch": 2.181434599156118, + "grad_norm": 1.0457103252410889, + "learning_rate": 7.834619957980112e-05, + "loss": 0.47188031673431396, + "step": 5170 + }, + { + "epoch": 2.1822784810126583, + "grad_norm": 1.1978110074996948, + "learning_rate": 7.832641378462319e-05, + "loss": 0.6149471998214722, + "step": 5172 + }, + { + "epoch": 2.1831223628691983, + "grad_norm": 1.2231460809707642, + "learning_rate": 7.830662145508567e-05, + "loss": 0.5520018339157104, + "step": 5174 + }, + { + "epoch": 2.183966244725738, + "grad_norm": 1.4367618560791016, + "learning_rate": 7.828682259575417e-05, + "loss": 0.6536548733711243, + "step": 5176 + }, + { + "epoch": 2.1848101265822786, + "grad_norm": 1.0891374349594116, + "learning_rate": 7.826701721119598e-05, + "loss": 0.5324372053146362, + "step": 5178 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 1.118695616722107, + "learning_rate": 7.82472053059798e-05, + "loss": 0.6127952337265015, + "step": 5180 + }, + { + "epoch": 2.1864978902953585, + "grad_norm": 1.1116070747375488, + "learning_rate": 7.822738688467585e-05, + "loss": 0.505962610244751, + "step": 5182 + }, + { + "epoch": 2.187341772151899, + "grad_norm": 1.2140545845031738, + "learning_rate": 7.820756195185586e-05, + "loss": 0.6210073232650757, + "step": 5184 + }, + { + "epoch": 2.188185654008439, + "grad_norm": 1.2135601043701172, + "learning_rate": 7.818773051209307e-05, + "loss": 0.6517674326896667, + "step": 5186 + }, + { + "epoch": 2.1890295358649787, + "grad_norm": 1.3875514268875122, + "learning_rate": 7.816789256996218e-05, + "loss": 0.5577492117881775, + "step": 5188 + }, + { + "epoch": 2.189873417721519, + "grad_norm": 1.181325912475586, + "learning_rate": 7.814804813003949e-05, + "loss": 0.6010199189186096, + "step": 5190 + }, + { + "epoch": 2.190717299578059, + "grad_norm": 1.102044701576233, + "learning_rate": 7.812819719690265e-05, + "loss": 0.5635302662849426, + "step": 5192 + }, + { + "epoch": 2.191561181434599, + "grad_norm": 1.4227958917617798, + "learning_rate": 7.810833977513094e-05, + "loss": 0.5804321765899658, + "step": 5194 + }, + { + "epoch": 2.1924050632911394, + "grad_norm": 1.2573446035385132, + "learning_rate": 7.80884758693051e-05, + "loss": 0.6005555987358093, + "step": 5196 + }, + { + "epoch": 2.1932489451476793, + "grad_norm": 1.3534085750579834, + "learning_rate": 7.80686054840073e-05, + "loss": 0.6263643503189087, + "step": 5198 + }, + { + "epoch": 2.1940928270042193, + "grad_norm": 1.6895852088928223, + "learning_rate": 7.804872862382131e-05, + "loss": 0.6235764622688293, + "step": 5200 + }, + { + "epoch": 2.1940928270042193, + "eval_loss": 0.6915348172187805, + "eval_runtime": 1167.9782, + "eval_samples_per_second": 1.804, + "eval_steps_per_second": 1.804, + "step": 5200 + }, + { + "epoch": 2.1949367088607596, + "grad_norm": 1.138973593711853, + "learning_rate": 7.802884529333227e-05, + "loss": 0.5586035847663879, + "step": 5202 + }, + { + "epoch": 2.1957805907172996, + "grad_norm": 1.3664026260375977, + "learning_rate": 7.800895549712697e-05, + "loss": 0.5768917202949524, + "step": 5204 + }, + { + "epoch": 2.1966244725738395, + "grad_norm": 1.2182449102401733, + "learning_rate": 7.798905923979353e-05, + "loss": 0.6046215891838074, + "step": 5206 + }, + { + "epoch": 2.19746835443038, + "grad_norm": 1.2692211866378784, + "learning_rate": 7.796915652592167e-05, + "loss": 0.5412904024124146, + "step": 5208 + }, + { + "epoch": 2.19831223628692, + "grad_norm": 1.200822114944458, + "learning_rate": 7.794924736010256e-05, + "loss": 0.5328584909439087, + "step": 5210 + }, + { + "epoch": 2.19915611814346, + "grad_norm": 1.1093779802322388, + "learning_rate": 7.792933174692886e-05, + "loss": 0.5497913360595703, + "step": 5212 + }, + { + "epoch": 2.2, + "grad_norm": 1.3838921785354614, + "learning_rate": 7.790940969099471e-05, + "loss": 0.5908066034317017, + "step": 5214 + }, + { + "epoch": 2.20084388185654, + "grad_norm": 1.1411913633346558, + "learning_rate": 7.788948119689576e-05, + "loss": 0.6117307543754578, + "step": 5216 + }, + { + "epoch": 2.20168776371308, + "grad_norm": 1.5668916702270508, + "learning_rate": 7.786954626922913e-05, + "loss": 0.5788605809211731, + "step": 5218 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 1.195027232170105, + "learning_rate": 7.784960491259344e-05, + "loss": 0.5948591828346252, + "step": 5220 + }, + { + "epoch": 2.2033755274261604, + "grad_norm": 1.2665271759033203, + "learning_rate": 7.782965713158872e-05, + "loss": 0.6321669220924377, + "step": 5222 + }, + { + "epoch": 2.2042194092827003, + "grad_norm": 1.123711109161377, + "learning_rate": 7.78097029308166e-05, + "loss": 0.5853859186172485, + "step": 5224 + }, + { + "epoch": 2.2050632911392407, + "grad_norm": 1.9381071329116821, + "learning_rate": 7.77897423148801e-05, + "loss": 0.6485977172851562, + "step": 5226 + }, + { + "epoch": 2.2059071729957807, + "grad_norm": 1.4062265157699585, + "learning_rate": 7.776977528838376e-05, + "loss": 0.6243517398834229, + "step": 5228 + }, + { + "epoch": 2.2067510548523206, + "grad_norm": 1.2127182483673096, + "learning_rate": 7.774980185593358e-05, + "loss": 0.5770578980445862, + "step": 5230 + }, + { + "epoch": 2.207594936708861, + "grad_norm": 1.250847578048706, + "learning_rate": 7.772982202213709e-05, + "loss": 0.6521194577217102, + "step": 5232 + }, + { + "epoch": 2.208438818565401, + "grad_norm": 1.2568131685256958, + "learning_rate": 7.77098357916032e-05, + "loss": 0.5755271911621094, + "step": 5234 + }, + { + "epoch": 2.209282700421941, + "grad_norm": 1.2422975301742554, + "learning_rate": 7.768984316894236e-05, + "loss": 0.5486469864845276, + "step": 5236 + }, + { + "epoch": 2.2101265822784812, + "grad_norm": 1.1018635034561157, + "learning_rate": 7.766984415876652e-05, + "loss": 0.5512928366661072, + "step": 5238 + }, + { + "epoch": 2.210970464135021, + "grad_norm": 1.2261123657226562, + "learning_rate": 7.764983876568903e-05, + "loss": 0.5753499269485474, + "step": 5240 + }, + { + "epoch": 2.211814345991561, + "grad_norm": 1.2222342491149902, + "learning_rate": 7.762982699432474e-05, + "loss": 0.5404848456382751, + "step": 5242 + }, + { + "epoch": 2.212658227848101, + "grad_norm": 1.231494426727295, + "learning_rate": 7.760980884929004e-05, + "loss": 0.5999218821525574, + "step": 5244 + }, + { + "epoch": 2.2135021097046415, + "grad_norm": 1.1530078649520874, + "learning_rate": 7.758978433520268e-05, + "loss": 0.6123101115226746, + "step": 5246 + }, + { + "epoch": 2.2143459915611814, + "grad_norm": 1.182706594467163, + "learning_rate": 7.756975345668194e-05, + "loss": 0.5945886969566345, + "step": 5248 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 1.0788652896881104, + "learning_rate": 7.754971621834857e-05, + "loss": 0.5698213577270508, + "step": 5250 + }, + { + "epoch": 2.2160337552742617, + "grad_norm": 1.2243359088897705, + "learning_rate": 7.752967262482477e-05, + "loss": 0.5959678888320923, + "step": 5252 + }, + { + "epoch": 2.2168776371308017, + "grad_norm": 1.4292869567871094, + "learning_rate": 7.750962268073421e-05, + "loss": 0.586794376373291, + "step": 5254 + }, + { + "epoch": 2.2177215189873416, + "grad_norm": 1.1809570789337158, + "learning_rate": 7.748956639070204e-05, + "loss": 0.5513298511505127, + "step": 5256 + }, + { + "epoch": 2.218565400843882, + "grad_norm": 1.485813856124878, + "learning_rate": 7.746950375935484e-05, + "loss": 0.6402831673622131, + "step": 5258 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 1.0851374864578247, + "learning_rate": 7.744943479132069e-05, + "loss": 0.5729117393493652, + "step": 5260 + }, + { + "epoch": 2.220253164556962, + "grad_norm": 1.4308949708938599, + "learning_rate": 7.742935949122911e-05, + "loss": 0.6239725947380066, + "step": 5262 + }, + { + "epoch": 2.2210970464135023, + "grad_norm": 1.379258155822754, + "learning_rate": 7.740927786371107e-05, + "loss": 0.6260181069374084, + "step": 5264 + }, + { + "epoch": 2.221940928270042, + "grad_norm": 1.1661925315856934, + "learning_rate": 7.738918991339905e-05, + "loss": 0.6074157357215881, + "step": 5266 + }, + { + "epoch": 2.222784810126582, + "grad_norm": 1.168901801109314, + "learning_rate": 7.736909564492694e-05, + "loss": 0.6119515895843506, + "step": 5268 + }, + { + "epoch": 2.2236286919831225, + "grad_norm": 1.1451057195663452, + "learning_rate": 7.734899506293008e-05, + "loss": 0.5505842566490173, + "step": 5270 + }, + { + "epoch": 2.2244725738396625, + "grad_norm": 1.2303991317749023, + "learning_rate": 7.732888817204533e-05, + "loss": 0.6117991805076599, + "step": 5272 + }, + { + "epoch": 2.2253164556962024, + "grad_norm": 1.04572331905365, + "learning_rate": 7.730877497691092e-05, + "loss": 0.5589770078659058, + "step": 5274 + }, + { + "epoch": 2.226160337552743, + "grad_norm": 1.2047234773635864, + "learning_rate": 7.72886554821666e-05, + "loss": 0.6288654208183289, + "step": 5276 + }, + { + "epoch": 2.2270042194092827, + "grad_norm": 1.2036652565002441, + "learning_rate": 7.726852969245355e-05, + "loss": 0.6174501776695251, + "step": 5278 + }, + { + "epoch": 2.2278481012658227, + "grad_norm": 1.1740167140960693, + "learning_rate": 7.72483976124144e-05, + "loss": 0.6027677655220032, + "step": 5280 + }, + { + "epoch": 2.228691983122363, + "grad_norm": 1.0600008964538574, + "learning_rate": 7.722825924669326e-05, + "loss": 0.6016151309013367, + "step": 5282 + }, + { + "epoch": 2.229535864978903, + "grad_norm": 1.2631008625030518, + "learning_rate": 7.720811459993562e-05, + "loss": 0.5905849933624268, + "step": 5284 + }, + { + "epoch": 2.230379746835443, + "grad_norm": 1.1024738550186157, + "learning_rate": 7.718796367678848e-05, + "loss": 0.5129587054252625, + "step": 5286 + }, + { + "epoch": 2.2312236286919833, + "grad_norm": 1.23116934299469, + "learning_rate": 7.716780648190028e-05, + "loss": 0.5709586143493652, + "step": 5288 + }, + { + "epoch": 2.2320675105485233, + "grad_norm": 1.2739102840423584, + "learning_rate": 7.714764301992088e-05, + "loss": 0.5454761385917664, + "step": 5290 + }, + { + "epoch": 2.232911392405063, + "grad_norm": 1.303963303565979, + "learning_rate": 7.712747329550162e-05, + "loss": 0.537248969078064, + "step": 5292 + }, + { + "epoch": 2.233755274261603, + "grad_norm": 1.2454309463500977, + "learning_rate": 7.710729731329529e-05, + "loss": 0.6364415884017944, + "step": 5294 + }, + { + "epoch": 2.2345991561181435, + "grad_norm": 1.2401882410049438, + "learning_rate": 7.708711507795605e-05, + "loss": 0.5640100240707397, + "step": 5296 + }, + { + "epoch": 2.2354430379746835, + "grad_norm": 1.197432041168213, + "learning_rate": 7.706692659413959e-05, + "loss": 0.5919729471206665, + "step": 5298 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 1.1779764890670776, + "learning_rate": 7.704673186650298e-05, + "loss": 0.5569849014282227, + "step": 5300 + }, + { + "epoch": 2.2362869198312234, + "eval_loss": 0.6898328065872192, + "eval_runtime": 739.3794, + "eval_samples_per_second": 2.85, + "eval_steps_per_second": 2.85, + "step": 5300 + }, + { + "epoch": 2.237130801687764, + "grad_norm": 1.1371463537216187, + "learning_rate": 7.702653089970479e-05, + "loss": 0.5823061466217041, + "step": 5302 + }, + { + "epoch": 2.2379746835443037, + "grad_norm": 1.1877846717834473, + "learning_rate": 7.700632369840497e-05, + "loss": 0.5556252002716064, + "step": 5304 + }, + { + "epoch": 2.2388185654008437, + "grad_norm": 1.1580896377563477, + "learning_rate": 7.698611026726492e-05, + "loss": 0.5794119834899902, + "step": 5306 + }, + { + "epoch": 2.239662447257384, + "grad_norm": 1.29141366481781, + "learning_rate": 7.696589061094755e-05, + "loss": 0.5828680396080017, + "step": 5308 + }, + { + "epoch": 2.240506329113924, + "grad_norm": 1.1286728382110596, + "learning_rate": 7.694566473411706e-05, + "loss": 0.6161736845970154, + "step": 5310 + }, + { + "epoch": 2.241350210970464, + "grad_norm": 1.0969985723495483, + "learning_rate": 7.692543264143925e-05, + "loss": 0.570767879486084, + "step": 5312 + }, + { + "epoch": 2.2421940928270043, + "grad_norm": 1.2902227640151978, + "learning_rate": 7.690519433758123e-05, + "loss": 0.631476104259491, + "step": 5314 + }, + { + "epoch": 2.2430379746835443, + "grad_norm": 1.432735800743103, + "learning_rate": 7.68849498272116e-05, + "loss": 0.6142309904098511, + "step": 5316 + }, + { + "epoch": 2.243881856540084, + "grad_norm": 1.0824161767959595, + "learning_rate": 7.686469911500038e-05, + "loss": 0.5871514081954956, + "step": 5318 + }, + { + "epoch": 2.2447257383966246, + "grad_norm": 1.1694978475570679, + "learning_rate": 7.684444220561902e-05, + "loss": 0.6144557595252991, + "step": 5320 + }, + { + "epoch": 2.2455696202531645, + "grad_norm": 1.2981040477752686, + "learning_rate": 7.68241791037404e-05, + "loss": 0.6049425601959229, + "step": 5322 + }, + { + "epoch": 2.2464135021097045, + "grad_norm": 1.132128357887268, + "learning_rate": 7.680390981403885e-05, + "loss": 0.5571867823600769, + "step": 5324 + }, + { + "epoch": 2.247257383966245, + "grad_norm": 1.1760079860687256, + "learning_rate": 7.678363434119005e-05, + "loss": 0.5710517168045044, + "step": 5326 + }, + { + "epoch": 2.248101265822785, + "grad_norm": 1.1918572187423706, + "learning_rate": 7.67633526898712e-05, + "loss": 0.5508866906166077, + "step": 5328 + }, + { + "epoch": 2.2489451476793247, + "grad_norm": 1.1837294101715088, + "learning_rate": 7.674306486476091e-05, + "loss": 0.6242696046829224, + "step": 5330 + }, + { + "epoch": 2.249789029535865, + "grad_norm": 1.384918212890625, + "learning_rate": 7.672277087053914e-05, + "loss": 0.5821678042411804, + "step": 5332 + }, + { + "epoch": 2.250632911392405, + "grad_norm": 1.1248877048492432, + "learning_rate": 7.670247071188738e-05, + "loss": 0.5415928363800049, + "step": 5334 + }, + { + "epoch": 2.251476793248945, + "grad_norm": 1.228140950202942, + "learning_rate": 7.668216439348843e-05, + "loss": 0.5475174188613892, + "step": 5336 + }, + { + "epoch": 2.2523206751054854, + "grad_norm": 1.3816046714782715, + "learning_rate": 7.666185192002662e-05, + "loss": 0.5793306231498718, + "step": 5338 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 1.2446565628051758, + "learning_rate": 7.664153329618759e-05, + "loss": 0.6221131682395935, + "step": 5340 + }, + { + "epoch": 2.2540084388185653, + "grad_norm": 1.1677669286727905, + "learning_rate": 7.662120852665852e-05, + "loss": 0.5403847694396973, + "step": 5342 + }, + { + "epoch": 2.2548523206751057, + "grad_norm": 1.2485873699188232, + "learning_rate": 7.66008776161279e-05, + "loss": 0.620201587677002, + "step": 5344 + }, + { + "epoch": 2.2556962025316456, + "grad_norm": 1.2486802339553833, + "learning_rate": 7.658054056928568e-05, + "loss": 0.5969216227531433, + "step": 5346 + }, + { + "epoch": 2.2565400843881855, + "grad_norm": 1.2621372938156128, + "learning_rate": 7.656019739082326e-05, + "loss": 0.6376339793205261, + "step": 5348 + }, + { + "epoch": 2.257383966244726, + "grad_norm": 1.238633155822754, + "learning_rate": 7.65398480854334e-05, + "loss": 0.6374872326850891, + "step": 5350 + }, + { + "epoch": 2.258227848101266, + "grad_norm": 1.3031803369522095, + "learning_rate": 7.651949265781029e-05, + "loss": 0.6348551511764526, + "step": 5352 + }, + { + "epoch": 2.259071729957806, + "grad_norm": 1.3735158443450928, + "learning_rate": 7.649913111264952e-05, + "loss": 0.6267750859260559, + "step": 5354 + }, + { + "epoch": 2.259915611814346, + "grad_norm": 1.1227772235870361, + "learning_rate": 7.647876345464817e-05, + "loss": 0.623030960559845, + "step": 5356 + }, + { + "epoch": 2.260759493670886, + "grad_norm": 1.4555678367614746, + "learning_rate": 7.645838968850459e-05, + "loss": 0.5810713171958923, + "step": 5358 + }, + { + "epoch": 2.261603375527426, + "grad_norm": 1.227725863456726, + "learning_rate": 7.643800981891867e-05, + "loss": 0.6150093078613281, + "step": 5360 + }, + { + "epoch": 2.2624472573839665, + "grad_norm": 1.0648300647735596, + "learning_rate": 7.641762385059161e-05, + "loss": 0.5350445508956909, + "step": 5362 + }, + { + "epoch": 2.2632911392405064, + "grad_norm": 1.179452896118164, + "learning_rate": 7.639723178822613e-05, + "loss": 0.6253421306610107, + "step": 5364 + }, + { + "epoch": 2.2641350210970463, + "grad_norm": 1.0983240604400635, + "learning_rate": 7.637683363652621e-05, + "loss": 0.5512562990188599, + "step": 5366 + }, + { + "epoch": 2.2649789029535867, + "grad_norm": 1.1825451850891113, + "learning_rate": 7.635642940019736e-05, + "loss": 0.5584151148796082, + "step": 5368 + }, + { + "epoch": 2.2658227848101267, + "grad_norm": 1.1022000312805176, + "learning_rate": 7.633601908394643e-05, + "loss": 0.5881790518760681, + "step": 5370 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.1935697793960571, + "learning_rate": 7.631560269248169e-05, + "loss": 0.6060683131217957, + "step": 5372 + }, + { + "epoch": 2.267510548523207, + "grad_norm": 1.1174103021621704, + "learning_rate": 7.62951802305128e-05, + "loss": 0.5877062678337097, + "step": 5374 + }, + { + "epoch": 2.268354430379747, + "grad_norm": 1.3934977054595947, + "learning_rate": 7.627475170275086e-05, + "loss": 0.5145504474639893, + "step": 5376 + }, + { + "epoch": 2.269198312236287, + "grad_norm": 1.2637842893600464, + "learning_rate": 7.625431711390831e-05, + "loss": 0.6194025874137878, + "step": 5378 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 1.2034388780593872, + "learning_rate": 7.623387646869902e-05, + "loss": 0.6205627918243408, + "step": 5380 + }, + { + "epoch": 2.270886075949367, + "grad_norm": 0.953880250453949, + "learning_rate": 7.621342977183826e-05, + "loss": 0.5609696507453918, + "step": 5382 + }, + { + "epoch": 2.271729957805907, + "grad_norm": 1.2841949462890625, + "learning_rate": 7.619297702804272e-05, + "loss": 0.6044906377792358, + "step": 5384 + }, + { + "epoch": 2.272573839662447, + "grad_norm": 1.146804690361023, + "learning_rate": 7.617251824203037e-05, + "loss": 0.5420435667037964, + "step": 5386 + }, + { + "epoch": 2.2734177215189875, + "grad_norm": 1.2225698232650757, + "learning_rate": 7.615205341852076e-05, + "loss": 0.6230710744857788, + "step": 5388 + }, + { + "epoch": 2.2742616033755274, + "grad_norm": 1.3423371315002441, + "learning_rate": 7.613158256223467e-05, + "loss": 0.6486349701881409, + "step": 5390 + }, + { + "epoch": 2.2751054852320673, + "grad_norm": 1.0840023756027222, + "learning_rate": 7.611110567789435e-05, + "loss": 0.6527825593948364, + "step": 5392 + }, + { + "epoch": 2.2759493670886077, + "grad_norm": 1.342466950416565, + "learning_rate": 7.609062277022341e-05, + "loss": 0.6859483122825623, + "step": 5394 + }, + { + "epoch": 2.2767932489451477, + "grad_norm": 1.0406129360198975, + "learning_rate": 7.607013384394691e-05, + "loss": 0.5536003708839417, + "step": 5396 + }, + { + "epoch": 2.2776371308016876, + "grad_norm": 1.0853544473648071, + "learning_rate": 7.604963890379118e-05, + "loss": 0.5488654971122742, + "step": 5398 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 1.0330145359039307, + "learning_rate": 7.602913795448407e-05, + "loss": 0.6072142720222473, + "step": 5400 + }, + { + "epoch": 2.278481012658228, + "eval_loss": 0.6875645518302917, + "eval_runtime": 861.3558, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 5400 + }, + { + "epoch": 2.279324894514768, + "grad_norm": 1.1858742237091064, + "learning_rate": 7.600863100075472e-05, + "loss": 0.5420109033584595, + "step": 5402 + }, + { + "epoch": 2.280168776371308, + "grad_norm": 1.2126039266586304, + "learning_rate": 7.598811804733373e-05, + "loss": 0.6109243631362915, + "step": 5404 + }, + { + "epoch": 2.2810126582278483, + "grad_norm": 1.1290241479873657, + "learning_rate": 7.5967599098953e-05, + "loss": 0.5889696478843689, + "step": 5406 + }, + { + "epoch": 2.281856540084388, + "grad_norm": 1.320263147354126, + "learning_rate": 7.594707416034586e-05, + "loss": 0.6548630595207214, + "step": 5408 + }, + { + "epoch": 2.282700421940928, + "grad_norm": 1.346169114112854, + "learning_rate": 7.592654323624703e-05, + "loss": 0.6556787490844727, + "step": 5410 + }, + { + "epoch": 2.2835443037974685, + "grad_norm": 1.2104716300964355, + "learning_rate": 7.590600633139265e-05, + "loss": 0.5631673336029053, + "step": 5412 + }, + { + "epoch": 2.2843881856540085, + "grad_norm": 1.3298237323760986, + "learning_rate": 7.58854634505201e-05, + "loss": 0.5931088328361511, + "step": 5414 + }, + { + "epoch": 2.2852320675105484, + "grad_norm": 1.4201204776763916, + "learning_rate": 7.586491459836829e-05, + "loss": 0.6966755986213684, + "step": 5416 + }, + { + "epoch": 2.286075949367089, + "grad_norm": 1.253135323524475, + "learning_rate": 7.584435977967743e-05, + "loss": 0.6172569394111633, + "step": 5418 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 1.133144736289978, + "learning_rate": 7.582379899918911e-05, + "loss": 0.5376655459403992, + "step": 5420 + }, + { + "epoch": 2.2877637130801687, + "grad_norm": 1.1103745698928833, + "learning_rate": 7.580323226164632e-05, + "loss": 0.6138498187065125, + "step": 5422 + }, + { + "epoch": 2.2886075949367086, + "grad_norm": 1.091636300086975, + "learning_rate": 7.57826595717934e-05, + "loss": 0.5049096345901489, + "step": 5424 + }, + { + "epoch": 2.289451476793249, + "grad_norm": 1.2486571073532104, + "learning_rate": 7.57620809343761e-05, + "loss": 0.5666115283966064, + "step": 5426 + }, + { + "epoch": 2.290295358649789, + "grad_norm": 1.510684847831726, + "learning_rate": 7.57414963541415e-05, + "loss": 0.49512919783592224, + "step": 5428 + }, + { + "epoch": 2.291139240506329, + "grad_norm": 1.1142191886901855, + "learning_rate": 7.572090583583805e-05, + "loss": 0.558807373046875, + "step": 5430 + }, + { + "epoch": 2.2919831223628693, + "grad_norm": 1.1162657737731934, + "learning_rate": 7.57003093842156e-05, + "loss": 0.6245265603065491, + "step": 5432 + }, + { + "epoch": 2.292827004219409, + "grad_norm": 1.2784614562988281, + "learning_rate": 7.567970700402537e-05, + "loss": 0.5505527853965759, + "step": 5434 + }, + { + "epoch": 2.293670886075949, + "grad_norm": 1.3142638206481934, + "learning_rate": 7.565909870001992e-05, + "loss": 0.6137702465057373, + "step": 5436 + }, + { + "epoch": 2.2945147679324895, + "grad_norm": 1.072805404663086, + "learning_rate": 7.563848447695318e-05, + "loss": 0.540766716003418, + "step": 5438 + }, + { + "epoch": 2.2953586497890295, + "grad_norm": 1.2861377000808716, + "learning_rate": 7.561786433958048e-05, + "loss": 0.6806555986404419, + "step": 5440 + }, + { + "epoch": 2.2962025316455694, + "grad_norm": 1.3193045854568481, + "learning_rate": 7.559723829265847e-05, + "loss": 0.6191258430480957, + "step": 5442 + }, + { + "epoch": 2.29704641350211, + "grad_norm": 1.1969127655029297, + "learning_rate": 7.55766063409452e-05, + "loss": 0.6067718863487244, + "step": 5444 + }, + { + "epoch": 2.2978902953586497, + "grad_norm": 1.2129666805267334, + "learning_rate": 7.555596848920006e-05, + "loss": 0.5673627257347107, + "step": 5446 + }, + { + "epoch": 2.2987341772151897, + "grad_norm": 1.1639961004257202, + "learning_rate": 7.553532474218379e-05, + "loss": 0.61825031042099, + "step": 5448 + }, + { + "epoch": 2.29957805907173, + "grad_norm": 1.3893283605575562, + "learning_rate": 7.551467510465852e-05, + "loss": 0.6096790432929993, + "step": 5450 + }, + { + "epoch": 2.30042194092827, + "grad_norm": 1.0708417892456055, + "learning_rate": 7.549401958138772e-05, + "loss": 0.6121414303779602, + "step": 5452 + }, + { + "epoch": 2.30126582278481, + "grad_norm": 1.3299298286437988, + "learning_rate": 7.547335817713624e-05, + "loss": 0.6504668593406677, + "step": 5454 + }, + { + "epoch": 2.3021097046413503, + "grad_norm": 1.3594682216644287, + "learning_rate": 7.545269089667022e-05, + "loss": 0.5761144161224365, + "step": 5456 + }, + { + "epoch": 2.3029535864978903, + "grad_norm": 1.1089586019515991, + "learning_rate": 7.543201774475726e-05, + "loss": 0.5457773804664612, + "step": 5458 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 1.3472918272018433, + "learning_rate": 7.541133872616624e-05, + "loss": 0.6014775037765503, + "step": 5460 + }, + { + "epoch": 2.3046413502109706, + "grad_norm": 1.2757689952850342, + "learning_rate": 7.53906538456674e-05, + "loss": 0.6246467232704163, + "step": 5462 + }, + { + "epoch": 2.3054852320675105, + "grad_norm": 1.4598166942596436, + "learning_rate": 7.536996310803236e-05, + "loss": 0.6583935022354126, + "step": 5464 + }, + { + "epoch": 2.3063291139240505, + "grad_norm": 1.2861602306365967, + "learning_rate": 7.534926651803407e-05, + "loss": 0.562523603439331, + "step": 5466 + }, + { + "epoch": 2.307172995780591, + "grad_norm": 1.0953221321105957, + "learning_rate": 7.532856408044684e-05, + "loss": 0.6093505620956421, + "step": 5468 + }, + { + "epoch": 2.308016877637131, + "grad_norm": 1.0982829332351685, + "learning_rate": 7.530785580004631e-05, + "loss": 0.6196447014808655, + "step": 5470 + }, + { + "epoch": 2.3088607594936708, + "grad_norm": 1.2224280834197998, + "learning_rate": 7.52871416816095e-05, + "loss": 0.6360989212989807, + "step": 5472 + }, + { + "epoch": 2.309704641350211, + "grad_norm": 1.244486927986145, + "learning_rate": 7.526642172991476e-05, + "loss": 0.6189543008804321, + "step": 5474 + }, + { + "epoch": 2.310548523206751, + "grad_norm": 1.2408053874969482, + "learning_rate": 7.524569594974178e-05, + "loss": 0.6137582659721375, + "step": 5476 + }, + { + "epoch": 2.311392405063291, + "grad_norm": 1.3323272466659546, + "learning_rate": 7.522496434587157e-05, + "loss": 0.6462169289588928, + "step": 5478 + }, + { + "epoch": 2.3122362869198314, + "grad_norm": 1.1076425313949585, + "learning_rate": 7.520422692308657e-05, + "loss": 0.5495362877845764, + "step": 5480 + }, + { + "epoch": 2.3130801687763713, + "grad_norm": 1.3298509120941162, + "learning_rate": 7.518348368617046e-05, + "loss": 0.5560636520385742, + "step": 5482 + }, + { + "epoch": 2.3139240506329113, + "grad_norm": 1.0740195512771606, + "learning_rate": 7.516273463990832e-05, + "loss": 0.5763371586799622, + "step": 5484 + }, + { + "epoch": 2.3147679324894517, + "grad_norm": 1.0748567581176758, + "learning_rate": 7.514197978908657e-05, + "loss": 0.5111498832702637, + "step": 5486 + }, + { + "epoch": 2.3156118143459916, + "grad_norm": 1.2047218084335327, + "learning_rate": 7.512121913849294e-05, + "loss": 0.6599951982498169, + "step": 5488 + }, + { + "epoch": 2.3164556962025316, + "grad_norm": 1.2956700325012207, + "learning_rate": 7.510045269291651e-05, + "loss": 0.6409770846366882, + "step": 5490 + }, + { + "epoch": 2.317299578059072, + "grad_norm": 1.241860032081604, + "learning_rate": 7.50796804571477e-05, + "loss": 0.5967662334442139, + "step": 5492 + }, + { + "epoch": 2.318143459915612, + "grad_norm": 1.1612682342529297, + "learning_rate": 7.50589024359783e-05, + "loss": 0.5856342315673828, + "step": 5494 + }, + { + "epoch": 2.318987341772152, + "grad_norm": 1.0895500183105469, + "learning_rate": 7.503811863420135e-05, + "loss": 0.5652023553848267, + "step": 5496 + }, + { + "epoch": 2.319831223628692, + "grad_norm": 1.3374481201171875, + "learning_rate": 7.50173290566113e-05, + "loss": 0.6777268648147583, + "step": 5498 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 1.192614197731018, + "learning_rate": 7.499653370800391e-05, + "loss": 0.6052314043045044, + "step": 5500 + }, + { + "epoch": 2.320675105485232, + "eval_loss": 0.6867148876190186, + "eval_runtime": 941.3545, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.712498806040281e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-5500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d617aec17526ce7e61d9dd1b7d4241f2909485b --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a93b24f41e0599fbe5645239ddef1be9e541db97e93b7b3fb2d617a1eb14075 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..687a20a277ef1f64c5d7eec1dd2cad88bec92355 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:491d13785a2c13bb97229cdedfb0d42bb632b5b5994d5ab85a141001d3746254 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..77ce503fd40bd5cd98985e480c839113f9340587 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d7a45bcf4f7261247d3dc5347443e4dbc427bae8b01c77b222449c279223b1e +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..54a27b859cfa18c671afcfe32c86f7fb4db4644b --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb56e52536f183e50740494b470aeead2ddc2627e248b7cfa447bc06372fce34 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..032a0d15aa11efa7218e411fbf26d47be4234151 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/trainer_state.json @@ -0,0 +1,21523 @@ +{ + "best_global_step": 6000, + "best_metric": 0.6813357472419739, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-6000", + "epoch": 2.5316455696202533, + "eval_steps": 100, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + }, + { + "epoch": 1.89957805907173, + "grad_norm": 0.9778118133544922, + "learning_rate": 8.456131445302538e-05, + "loss": 0.5973100662231445, + "step": 4502 + }, + { + "epoch": 1.90042194092827, + "grad_norm": 0.9587083458900452, + "learning_rate": 8.454395658110536e-05, + "loss": 0.5982911586761475, + "step": 4504 + }, + { + "epoch": 1.90126582278481, + "grad_norm": 1.327643871307373, + "learning_rate": 8.452659074055798e-05, + "loss": 0.6858586668968201, + "step": 4506 + }, + { + "epoch": 1.9021097046413502, + "grad_norm": 1.0740257501602173, + "learning_rate": 8.450921693538922e-05, + "loss": 0.6172328591346741, + "step": 4508 + }, + { + "epoch": 1.9029535864978904, + "grad_norm": 1.0705101490020752, + "learning_rate": 8.449183516960685e-05, + "loss": 0.5349634289741516, + "step": 4510 + }, + { + "epoch": 1.9037974683544303, + "grad_norm": 0.9151237607002258, + "learning_rate": 8.447444544722058e-05, + "loss": 0.5769277811050415, + "step": 4512 + }, + { + "epoch": 1.9046413502109705, + "grad_norm": 1.139900803565979, + "learning_rate": 8.44570477722418e-05, + "loss": 0.6579093933105469, + "step": 4514 + }, + { + "epoch": 1.9054852320675106, + "grad_norm": 1.2481658458709717, + "learning_rate": 8.443964214868387e-05, + "loss": 0.6748929619789124, + "step": 4516 + }, + { + "epoch": 1.9063291139240506, + "grad_norm": 1.1661686897277832, + "learning_rate": 8.442222858056193e-05, + "loss": 0.6492021083831787, + "step": 4518 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 1.241477370262146, + "learning_rate": 8.440480707189295e-05, + "loss": 0.635409951210022, + "step": 4520 + }, + { + "epoch": 1.908016877637131, + "grad_norm": 1.1102054119110107, + "learning_rate": 8.438737762669573e-05, + "loss": 0.631928026676178, + "step": 4522 + }, + { + "epoch": 1.9088607594936708, + "grad_norm": 1.0638107061386108, + "learning_rate": 8.43699402489909e-05, + "loss": 0.604518473148346, + "step": 4524 + }, + { + "epoch": 1.909704641350211, + "grad_norm": 1.0270655155181885, + "learning_rate": 8.435249494280096e-05, + "loss": 0.61314457654953, + "step": 4526 + }, + { + "epoch": 1.9105485232067512, + "grad_norm": 1.1840111017227173, + "learning_rate": 8.433504171215018e-05, + "loss": 0.661663293838501, + "step": 4528 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 1.1404399871826172, + "learning_rate": 8.43175805610647e-05, + "loss": 0.7026967406272888, + "step": 4530 + }, + { + "epoch": 1.9122362869198313, + "grad_norm": 1.2371265888214111, + "learning_rate": 8.430011149357246e-05, + "loss": 0.6599440574645996, + "step": 4532 + }, + { + "epoch": 1.9130801687763714, + "grad_norm": 1.0042651891708374, + "learning_rate": 8.428263451370326e-05, + "loss": 0.5728344321250916, + "step": 4534 + }, + { + "epoch": 1.9139240506329114, + "grad_norm": 1.04367196559906, + "learning_rate": 8.426514962548866e-05, + "loss": 0.6495450735092163, + "step": 4536 + }, + { + "epoch": 1.9147679324894513, + "grad_norm": 1.0867135524749756, + "learning_rate": 8.424765683296215e-05, + "loss": 0.6406553387641907, + "step": 4538 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 1.0751310586929321, + "learning_rate": 8.423015614015892e-05, + "loss": 0.6692186594009399, + "step": 4540 + }, + { + "epoch": 1.9164556962025316, + "grad_norm": 1.13556969165802, + "learning_rate": 8.421264755111607e-05, + "loss": 0.6029785871505737, + "step": 4542 + }, + { + "epoch": 1.9172995780590716, + "grad_norm": 1.1560977697372437, + "learning_rate": 8.419513106987251e-05, + "loss": 0.6457844972610474, + "step": 4544 + }, + { + "epoch": 1.918143459915612, + "grad_norm": 1.2192902565002441, + "learning_rate": 8.417760670046893e-05, + "loss": 0.7082147598266602, + "step": 4546 + }, + { + "epoch": 1.918987341772152, + "grad_norm": 1.1170696020126343, + "learning_rate": 8.41600744469479e-05, + "loss": 0.6919234991073608, + "step": 4548 + }, + { + "epoch": 1.9198312236286919, + "grad_norm": 1.061253547668457, + "learning_rate": 8.414253431335373e-05, + "loss": 0.6310052871704102, + "step": 4550 + }, + { + "epoch": 1.920675105485232, + "grad_norm": 1.0671885013580322, + "learning_rate": 8.412498630373263e-05, + "loss": 0.6330236792564392, + "step": 4552 + }, + { + "epoch": 1.9215189873417722, + "grad_norm": 1.2085163593292236, + "learning_rate": 8.410743042213256e-05, + "loss": 0.7031015157699585, + "step": 4554 + }, + { + "epoch": 1.9223628691983121, + "grad_norm": 1.2682013511657715, + "learning_rate": 8.408986667260334e-05, + "loss": 0.7078304290771484, + "step": 4556 + }, + { + "epoch": 1.9232067510548523, + "grad_norm": 1.2966876029968262, + "learning_rate": 8.407229505919658e-05, + "loss": 0.6542860865592957, + "step": 4558 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 1.1086169481277466, + "learning_rate": 8.405471558596573e-05, + "loss": 0.5856828093528748, + "step": 4560 + }, + { + "epoch": 1.9248945147679324, + "grad_norm": 1.3175504207611084, + "learning_rate": 8.403712825696604e-05, + "loss": 0.7382104992866516, + "step": 4562 + }, + { + "epoch": 1.9257383966244725, + "grad_norm": 1.163164496421814, + "learning_rate": 8.401953307625454e-05, + "loss": 0.6862360239028931, + "step": 4564 + }, + { + "epoch": 1.9265822784810127, + "grad_norm": 1.207650899887085, + "learning_rate": 8.400193004789013e-05, + "loss": 0.7442302703857422, + "step": 4566 + }, + { + "epoch": 1.9274261603375527, + "grad_norm": 1.1570589542388916, + "learning_rate": 8.398431917593345e-05, + "loss": 0.595226526260376, + "step": 4568 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 1.091927170753479, + "learning_rate": 8.396670046444704e-05, + "loss": 0.6360410451889038, + "step": 4570 + }, + { + "epoch": 1.929113924050633, + "grad_norm": 1.149559497833252, + "learning_rate": 8.394907391749516e-05, + "loss": 0.6343122124671936, + "step": 4572 + }, + { + "epoch": 1.929957805907173, + "grad_norm": 1.0585254430770874, + "learning_rate": 8.393143953914395e-05, + "loss": 0.7394745349884033, + "step": 4574 + }, + { + "epoch": 1.930801687763713, + "grad_norm": 1.1648521423339844, + "learning_rate": 8.391379733346128e-05, + "loss": 0.6489678025245667, + "step": 4576 + }, + { + "epoch": 1.9316455696202532, + "grad_norm": 1.1756316423416138, + "learning_rate": 8.389614730451692e-05, + "loss": 0.6687861084938049, + "step": 4578 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.9857237339019775, + "learning_rate": 8.387848945638235e-05, + "loss": 0.523727536201477, + "step": 4580 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.1038693189620972, + "learning_rate": 8.386082379313092e-05, + "loss": 0.6545047760009766, + "step": 4582 + }, + { + "epoch": 1.9341772151898735, + "grad_norm": 1.0780832767486572, + "learning_rate": 8.384315031883774e-05, + "loss": 0.6067036390304565, + "step": 4584 + }, + { + "epoch": 1.9350210970464135, + "grad_norm": 1.2915070056915283, + "learning_rate": 8.382546903757975e-05, + "loss": 0.6880824565887451, + "step": 4586 + }, + { + "epoch": 1.9358649789029536, + "grad_norm": 1.1243441104888916, + "learning_rate": 8.380777995343568e-05, + "loss": 0.7319117188453674, + "step": 4588 + }, + { + "epoch": 1.9367088607594938, + "grad_norm": 1.1143072843551636, + "learning_rate": 8.379008307048609e-05, + "loss": 0.6845395565032959, + "step": 4590 + }, + { + "epoch": 1.9375527426160337, + "grad_norm": 1.039494276046753, + "learning_rate": 8.377237839281327e-05, + "loss": 0.6653600335121155, + "step": 4592 + }, + { + "epoch": 1.9383966244725739, + "grad_norm": 1.299617886543274, + "learning_rate": 8.375466592450136e-05, + "loss": 0.6352495551109314, + "step": 4594 + }, + { + "epoch": 1.939240506329114, + "grad_norm": 0.9918657541275024, + "learning_rate": 8.373694566963631e-05, + "loss": 0.5660957098007202, + "step": 4596 + }, + { + "epoch": 1.940084388185654, + "grad_norm": 1.0540478229522705, + "learning_rate": 8.371921763230579e-05, + "loss": 0.6296496987342834, + "step": 4598 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 1.1309545040130615, + "learning_rate": 8.370148181659939e-05, + "loss": 0.6672025918960571, + "step": 4600 + }, + { + "epoch": 1.9409282700421941, + "eval_loss": 0.6930755376815796, + "eval_runtime": 617.8927, + "eval_samples_per_second": 3.41, + "eval_steps_per_second": 3.41, + "step": 4600 + }, + { + "epoch": 1.9417721518987343, + "grad_norm": 1.2338588237762451, + "learning_rate": 8.368373822660836e-05, + "loss": 0.6200884580612183, + "step": 4602 + }, + { + "epoch": 1.9426160337552743, + "grad_norm": 1.1756945848464966, + "learning_rate": 8.366598686642582e-05, + "loss": 0.653294026851654, + "step": 4604 + }, + { + "epoch": 1.9434599156118142, + "grad_norm": 1.032018780708313, + "learning_rate": 8.364822774014671e-05, + "loss": 0.5670395493507385, + "step": 4606 + }, + { + "epoch": 1.9443037974683546, + "grad_norm": 1.045280933380127, + "learning_rate": 8.363046085186766e-05, + "loss": 0.6819197535514832, + "step": 4608 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 1.3223930597305298, + "learning_rate": 8.36126862056872e-05, + "loss": 0.6952820420265198, + "step": 4610 + }, + { + "epoch": 1.9459915611814345, + "grad_norm": 1.0048432350158691, + "learning_rate": 8.359490380570556e-05, + "loss": 0.5291440486907959, + "step": 4612 + }, + { + "epoch": 1.9468354430379748, + "grad_norm": 1.1477346420288086, + "learning_rate": 8.357711365602483e-05, + "loss": 0.6857813000679016, + "step": 4614 + }, + { + "epoch": 1.9476793248945148, + "grad_norm": 0.959985077381134, + "learning_rate": 8.355931576074882e-05, + "loss": 0.5581508278846741, + "step": 4616 + }, + { + "epoch": 1.9485232067510547, + "grad_norm": 1.1104289293289185, + "learning_rate": 8.35415101239832e-05, + "loss": 0.6536211371421814, + "step": 4618 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 1.2344517707824707, + "learning_rate": 8.352369674983535e-05, + "loss": 0.6570560336112976, + "step": 4620 + }, + { + "epoch": 1.950210970464135, + "grad_norm": 1.3411606550216675, + "learning_rate": 8.350587564241451e-05, + "loss": 0.6070495247840881, + "step": 4622 + }, + { + "epoch": 1.951054852320675, + "grad_norm": 1.1713159084320068, + "learning_rate": 8.348804680583166e-05, + "loss": 0.6444135904312134, + "step": 4624 + }, + { + "epoch": 1.9518987341772152, + "grad_norm": 1.127242922782898, + "learning_rate": 8.347021024419954e-05, + "loss": 0.6517419815063477, + "step": 4626 + }, + { + "epoch": 1.9527426160337553, + "grad_norm": 1.0733028650283813, + "learning_rate": 8.345236596163274e-05, + "loss": 0.6174065470695496, + "step": 4628 + }, + { + "epoch": 1.9535864978902953, + "grad_norm": 1.1114680767059326, + "learning_rate": 8.343451396224757e-05, + "loss": 0.7163593769073486, + "step": 4630 + }, + { + "epoch": 1.9544303797468354, + "grad_norm": 1.0839568376541138, + "learning_rate": 8.341665425016216e-05, + "loss": 0.698553204536438, + "step": 4632 + }, + { + "epoch": 1.9552742616033756, + "grad_norm": 1.17001211643219, + "learning_rate": 8.339878682949638e-05, + "loss": 0.6224857568740845, + "step": 4634 + }, + { + "epoch": 1.9561181434599155, + "grad_norm": 3.483793020248413, + "learning_rate": 8.338091170437193e-05, + "loss": 0.5931200981140137, + "step": 4636 + }, + { + "epoch": 1.9569620253164557, + "grad_norm": 1.1575394868850708, + "learning_rate": 8.336302887891224e-05, + "loss": 0.6031442284584045, + "step": 4638 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 1.1494992971420288, + "learning_rate": 8.334513835724252e-05, + "loss": 0.6101768016815186, + "step": 4640 + }, + { + "epoch": 1.9586497890295358, + "grad_norm": 1.3858197927474976, + "learning_rate": 8.332724014348981e-05, + "loss": 0.6571711301803589, + "step": 4642 + }, + { + "epoch": 1.959493670886076, + "grad_norm": 1.1094943284988403, + "learning_rate": 8.330933424178284e-05, + "loss": 0.6391071677207947, + "step": 4644 + }, + { + "epoch": 1.9603375527426161, + "grad_norm": 1.1640198230743408, + "learning_rate": 8.329142065625218e-05, + "loss": 0.6542805433273315, + "step": 4646 + }, + { + "epoch": 1.961181434599156, + "grad_norm": 1.1080211400985718, + "learning_rate": 8.327349939103016e-05, + "loss": 0.6053075194358826, + "step": 4648 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.0137052536010742, + "learning_rate": 8.325557045025085e-05, + "loss": 0.6009573340415955, + "step": 4650 + }, + { + "epoch": 1.9628691983122364, + "grad_norm": 1.0867283344268799, + "learning_rate": 8.323763383805012e-05, + "loss": 0.5993483066558838, + "step": 4652 + }, + { + "epoch": 1.9637130801687763, + "grad_norm": 1.0577161312103271, + "learning_rate": 8.321968955856562e-05, + "loss": 0.6788463592529297, + "step": 4654 + }, + { + "epoch": 1.9645569620253165, + "grad_norm": 1.2002183198928833, + "learning_rate": 8.320173761593672e-05, + "loss": 0.5786917209625244, + "step": 4656 + }, + { + "epoch": 1.9654008438818567, + "grad_norm": 1.2266993522644043, + "learning_rate": 8.318377801430461e-05, + "loss": 0.7437994480133057, + "step": 4658 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 1.007582187652588, + "learning_rate": 8.316581075781223e-05, + "loss": 0.6763550639152527, + "step": 4660 + }, + { + "epoch": 1.9670886075949368, + "grad_norm": 1.2374811172485352, + "learning_rate": 8.314783585060425e-05, + "loss": 0.6953140497207642, + "step": 4662 + }, + { + "epoch": 1.967932489451477, + "grad_norm": 1.1791057586669922, + "learning_rate": 8.312985329682717e-05, + "loss": 0.6867341995239258, + "step": 4664 + }, + { + "epoch": 1.9687763713080169, + "grad_norm": 1.1903331279754639, + "learning_rate": 8.31118631006292e-05, + "loss": 0.6445001363754272, + "step": 4666 + }, + { + "epoch": 1.9696202531645568, + "grad_norm": 1.1731067895889282, + "learning_rate": 8.309386526616034e-05, + "loss": 0.6500589847564697, + "step": 4668 + }, + { + "epoch": 1.9704641350210972, + "grad_norm": 0.9470233917236328, + "learning_rate": 8.307585979757233e-05, + "loss": 0.6215718984603882, + "step": 4670 + }, + { + "epoch": 1.9713080168776371, + "grad_norm": 1.2900800704956055, + "learning_rate": 8.305784669901872e-05, + "loss": 0.6396787762641907, + "step": 4672 + }, + { + "epoch": 1.972151898734177, + "grad_norm": 1.1729133129119873, + "learning_rate": 8.303982597465474e-05, + "loss": 0.6581959128379822, + "step": 4674 + }, + { + "epoch": 1.9729957805907175, + "grad_norm": 1.1450555324554443, + "learning_rate": 8.302179762863746e-05, + "loss": 0.7013490796089172, + "step": 4676 + }, + { + "epoch": 1.9738396624472574, + "grad_norm": 1.1506338119506836, + "learning_rate": 8.300376166512567e-05, + "loss": 0.6796102523803711, + "step": 4678 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 1.149979591369629, + "learning_rate": 8.298571808827991e-05, + "loss": 0.6960519552230835, + "step": 4680 + }, + { + "epoch": 1.9755274261603377, + "grad_norm": 1.1078912019729614, + "learning_rate": 8.296766690226249e-05, + "loss": 0.6789507865905762, + "step": 4682 + }, + { + "epoch": 1.9763713080168777, + "grad_norm": 1.0199202299118042, + "learning_rate": 8.294960811123747e-05, + "loss": 0.5962659120559692, + "step": 4684 + }, + { + "epoch": 1.9772151898734176, + "grad_norm": 1.2226134538650513, + "learning_rate": 8.293154171937068e-05, + "loss": 0.6483094692230225, + "step": 4686 + }, + { + "epoch": 1.9780590717299578, + "grad_norm": 1.184095025062561, + "learning_rate": 8.291346773082965e-05, + "loss": 0.6750242710113525, + "step": 4688 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 1.1018693447113037, + "learning_rate": 8.289538614978375e-05, + "loss": 0.7094066739082336, + "step": 4690 + }, + { + "epoch": 1.9797468354430379, + "grad_norm": 1.0342390537261963, + "learning_rate": 8.287729698040403e-05, + "loss": 0.6554126739501953, + "step": 4692 + }, + { + "epoch": 1.980590717299578, + "grad_norm": 1.0603563785552979, + "learning_rate": 8.285920022686332e-05, + "loss": 0.5493529438972473, + "step": 4694 + }, + { + "epoch": 1.9814345991561182, + "grad_norm": 1.139609932899475, + "learning_rate": 8.284109589333617e-05, + "loss": 0.6824741363525391, + "step": 4696 + }, + { + "epoch": 1.9822784810126581, + "grad_norm": 1.2167822122573853, + "learning_rate": 8.282298398399895e-05, + "loss": 0.7121000289916992, + "step": 4698 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 1.109857201576233, + "learning_rate": 8.280486450302968e-05, + "loss": 0.6711249351501465, + "step": 4700 + }, + { + "epoch": 1.9831223628691983, + "eval_loss": 0.6923081278800964, + "eval_runtime": 514.7729, + "eval_samples_per_second": 4.093, + "eval_steps_per_second": 4.093, + "step": 4700 + }, + { + "epoch": 1.9839662447257385, + "grad_norm": 1.1387107372283936, + "learning_rate": 8.27867374546082e-05, + "loss": 0.581635594367981, + "step": 4702 + }, + { + "epoch": 1.9848101265822784, + "grad_norm": 1.2519257068634033, + "learning_rate": 8.27686028429161e-05, + "loss": 0.6867302060127258, + "step": 4704 + }, + { + "epoch": 1.9856540084388186, + "grad_norm": 1.0927205085754395, + "learning_rate": 8.275046067213663e-05, + "loss": 0.6494556665420532, + "step": 4706 + }, + { + "epoch": 1.9864978902953587, + "grad_norm": 1.042035698890686, + "learning_rate": 8.273231094645487e-05, + "loss": 0.6949493288993835, + "step": 4708 + }, + { + "epoch": 1.9873417721518987, + "grad_norm": 1.0220824480056763, + "learning_rate": 8.271415367005762e-05, + "loss": 0.6535884737968445, + "step": 4710 + }, + { + "epoch": 1.9881856540084388, + "grad_norm": 1.3023611307144165, + "learning_rate": 8.269598884713339e-05, + "loss": 0.6635278463363647, + "step": 4712 + }, + { + "epoch": 1.989029535864979, + "grad_norm": 1.2526965141296387, + "learning_rate": 8.267781648187248e-05, + "loss": 0.7194697856903076, + "step": 4714 + }, + { + "epoch": 1.989873417721519, + "grad_norm": 1.0388038158416748, + "learning_rate": 8.265963657846691e-05, + "loss": 0.6355333924293518, + "step": 4716 + }, + { + "epoch": 1.990717299578059, + "grad_norm": 1.0852965116500854, + "learning_rate": 8.264144914111041e-05, + "loss": 0.6898305416107178, + "step": 4718 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 1.0714049339294434, + "learning_rate": 8.262325417399847e-05, + "loss": 0.6202836036682129, + "step": 4720 + }, + { + "epoch": 1.9924050632911392, + "grad_norm": 1.0767238140106201, + "learning_rate": 8.260505168132835e-05, + "loss": 0.6160458326339722, + "step": 4722 + }, + { + "epoch": 1.9932489451476794, + "grad_norm": 0.9605211615562439, + "learning_rate": 8.258684166729899e-05, + "loss": 0.6049920916557312, + "step": 4724 + }, + { + "epoch": 1.9940928270042195, + "grad_norm": 1.0580185651779175, + "learning_rate": 8.256862413611113e-05, + "loss": 0.5622014999389648, + "step": 4726 + }, + { + "epoch": 1.9949367088607595, + "grad_norm": 1.1039034128189087, + "learning_rate": 8.255039909196713e-05, + "loss": 0.6678924560546875, + "step": 4728 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 1.1482586860656738, + "learning_rate": 8.253216653907123e-05, + "loss": 0.658260703086853, + "step": 4730 + }, + { + "epoch": 1.9966244725738398, + "grad_norm": 1.135349988937378, + "learning_rate": 8.251392648162929e-05, + "loss": 0.6461613178253174, + "step": 4732 + }, + { + "epoch": 1.9974683544303797, + "grad_norm": 1.0155420303344727, + "learning_rate": 8.249567892384895e-05, + "loss": 0.6837426424026489, + "step": 4734 + }, + { + "epoch": 1.9983122362869197, + "grad_norm": 1.3392970561981201, + "learning_rate": 8.247742386993958e-05, + "loss": 0.6091697812080383, + "step": 4736 + }, + { + "epoch": 1.99915611814346, + "grad_norm": 1.0509974956512451, + "learning_rate": 8.245916132411226e-05, + "loss": 0.6539653539657593, + "step": 4738 + }, + { + "epoch": 2.0, + "grad_norm": 0.9777396321296692, + "learning_rate": 8.244089129057982e-05, + "loss": 0.5630147457122803, + "step": 4740 + }, + { + "epoch": 2.00084388185654, + "grad_norm": 1.1639164686203003, + "learning_rate": 8.24226137735568e-05, + "loss": 0.6190353631973267, + "step": 4742 + }, + { + "epoch": 2.0016877637130803, + "grad_norm": 1.119614839553833, + "learning_rate": 8.240432877725947e-05, + "loss": 0.6282529234886169, + "step": 4744 + }, + { + "epoch": 2.0025316455696203, + "grad_norm": 1.114739179611206, + "learning_rate": 8.238603630590581e-05, + "loss": 0.6176725625991821, + "step": 4746 + }, + { + "epoch": 2.00337552742616, + "grad_norm": 1.0543076992034912, + "learning_rate": 8.236773636371557e-05, + "loss": 0.5182007551193237, + "step": 4748 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 1.060389518737793, + "learning_rate": 8.234942895491019e-05, + "loss": 0.532536506652832, + "step": 4750 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 1.0824412107467651, + "learning_rate": 8.233111408371282e-05, + "loss": 0.5474061369895935, + "step": 4752 + }, + { + "epoch": 2.0059071729957805, + "grad_norm": 1.1450858116149902, + "learning_rate": 8.231279175434838e-05, + "loss": 0.586384654045105, + "step": 4754 + }, + { + "epoch": 2.006751054852321, + "grad_norm": 1.1225577592849731, + "learning_rate": 8.229446197104345e-05, + "loss": 0.6469444036483765, + "step": 4756 + }, + { + "epoch": 2.007594936708861, + "grad_norm": 1.7292449474334717, + "learning_rate": 8.227612473802637e-05, + "loss": 0.5371572971343994, + "step": 4758 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 1.1743781566619873, + "learning_rate": 8.22577800595272e-05, + "loss": 0.558707058429718, + "step": 4760 + }, + { + "epoch": 2.009282700421941, + "grad_norm": 1.0385273694992065, + "learning_rate": 8.223942793977769e-05, + "loss": 0.5943514108657837, + "step": 4762 + }, + { + "epoch": 2.010126582278481, + "grad_norm": 1.1302000284194946, + "learning_rate": 8.222106838301131e-05, + "loss": 0.5630753636360168, + "step": 4764 + }, + { + "epoch": 2.010970464135021, + "grad_norm": 1.140005111694336, + "learning_rate": 8.220270139346327e-05, + "loss": 0.527510404586792, + "step": 4766 + }, + { + "epoch": 2.0118143459915614, + "grad_norm": 1.1979734897613525, + "learning_rate": 8.21843269753705e-05, + "loss": 0.6315013766288757, + "step": 4768 + }, + { + "epoch": 2.0126582278481013, + "grad_norm": 1.3759459257125854, + "learning_rate": 8.21659451329716e-05, + "loss": 0.6225199699401855, + "step": 4770 + }, + { + "epoch": 2.0135021097046413, + "grad_norm": 1.330600380897522, + "learning_rate": 8.21475558705069e-05, + "loss": 0.6838938593864441, + "step": 4772 + }, + { + "epoch": 2.014345991561181, + "grad_norm": 1.2365351915359497, + "learning_rate": 8.21291591922185e-05, + "loss": 0.606302797794342, + "step": 4774 + }, + { + "epoch": 2.0151898734177216, + "grad_norm": 1.1886142492294312, + "learning_rate": 8.211075510235011e-05, + "loss": 0.6194182634353638, + "step": 4776 + }, + { + "epoch": 2.0160337552742615, + "grad_norm": 1.1414743661880493, + "learning_rate": 8.209234360514721e-05, + "loss": 0.639540433883667, + "step": 4778 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 1.2877455949783325, + "learning_rate": 8.2073924704857e-05, + "loss": 0.6350902318954468, + "step": 4780 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 1.095578908920288, + "learning_rate": 8.205549840572834e-05, + "loss": 0.5152000784873962, + "step": 4782 + }, + { + "epoch": 2.018565400843882, + "grad_norm": 1.0043798685073853, + "learning_rate": 8.203706471201183e-05, + "loss": 0.46245837211608887, + "step": 4784 + }, + { + "epoch": 2.0194092827004217, + "grad_norm": 1.2133857011795044, + "learning_rate": 8.201862362795979e-05, + "loss": 0.6471722722053528, + "step": 4786 + }, + { + "epoch": 2.020253164556962, + "grad_norm": 1.0835390090942383, + "learning_rate": 8.200017515782619e-05, + "loss": 0.5790625214576721, + "step": 4788 + }, + { + "epoch": 2.021097046413502, + "grad_norm": 1.0176091194152832, + "learning_rate": 8.198171930586678e-05, + "loss": 0.5826238989830017, + "step": 4790 + }, + { + "epoch": 2.021940928270042, + "grad_norm": 1.1581370830535889, + "learning_rate": 8.196325607633893e-05, + "loss": 0.5781272649765015, + "step": 4792 + }, + { + "epoch": 2.0227848101265824, + "grad_norm": 1.243381142616272, + "learning_rate": 8.194478547350178e-05, + "loss": 0.6600401997566223, + "step": 4794 + }, + { + "epoch": 2.0236286919831223, + "grad_norm": 1.0718560218811035, + "learning_rate": 8.192630750161612e-05, + "loss": 0.5291268825531006, + "step": 4796 + }, + { + "epoch": 2.0244725738396623, + "grad_norm": 1.2338320016860962, + "learning_rate": 8.190782216494448e-05, + "loss": 0.6564924120903015, + "step": 4798 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.978547990322113, + "learning_rate": 8.188932946775107e-05, + "loss": 0.5471183657646179, + "step": 4800 + }, + { + "epoch": 2.0253164556962027, + "eval_loss": 0.6924457550048828, + "eval_runtime": 514.0427, + "eval_samples_per_second": 4.099, + "eval_steps_per_second": 4.099, + "step": 4800 + }, + { + "epoch": 2.0261603375527426, + "grad_norm": 1.1782792806625366, + "learning_rate": 8.18708294143018e-05, + "loss": 0.567442774772644, + "step": 4802 + }, + { + "epoch": 2.0270042194092825, + "grad_norm": 1.0768574476242065, + "learning_rate": 8.185232200886426e-05, + "loss": 0.6005180478096008, + "step": 4804 + }, + { + "epoch": 2.027848101265823, + "grad_norm": 1.3096717596054077, + "learning_rate": 8.18338072557078e-05, + "loss": 0.616436779499054, + "step": 4806 + }, + { + "epoch": 2.028691983122363, + "grad_norm": 1.0233508348464966, + "learning_rate": 8.181528515910336e-05, + "loss": 0.49587416648864746, + "step": 4808 + }, + { + "epoch": 2.029535864978903, + "grad_norm": 1.0800065994262695, + "learning_rate": 8.179675572332366e-05, + "loss": 0.5758571624755859, + "step": 4810 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 1.09299898147583, + "learning_rate": 8.177821895264309e-05, + "loss": 0.561736524105072, + "step": 4812 + }, + { + "epoch": 2.031223628691983, + "grad_norm": 1.1439210176467896, + "learning_rate": 8.175967485133771e-05, + "loss": 0.5249468088150024, + "step": 4814 + }, + { + "epoch": 2.032067510548523, + "grad_norm": 1.15841805934906, + "learning_rate": 8.174112342368532e-05, + "loss": 0.6429001688957214, + "step": 4816 + }, + { + "epoch": 2.0329113924050635, + "grad_norm": 1.1720670461654663, + "learning_rate": 8.172256467396533e-05, + "loss": 0.60152667760849, + "step": 4818 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 1.2652091979980469, + "learning_rate": 8.170399860645892e-05, + "loss": 0.5553541779518127, + "step": 4820 + }, + { + "epoch": 2.0345991561181433, + "grad_norm": 1.0768507719039917, + "learning_rate": 8.168542522544893e-05, + "loss": 0.5369323492050171, + "step": 4822 + }, + { + "epoch": 2.0354430379746837, + "grad_norm": 0.9906469583511353, + "learning_rate": 8.166684453521986e-05, + "loss": 0.5468952655792236, + "step": 4824 + }, + { + "epoch": 2.0362869198312237, + "grad_norm": 1.3448988199234009, + "learning_rate": 8.164825654005792e-05, + "loss": 0.5795659422874451, + "step": 4826 + }, + { + "epoch": 2.0371308016877636, + "grad_norm": 1.2502341270446777, + "learning_rate": 8.162966124425103e-05, + "loss": 0.6465779542922974, + "step": 4828 + }, + { + "epoch": 2.037974683544304, + "grad_norm": 1.1512303352355957, + "learning_rate": 8.161105865208875e-05, + "loss": 0.5509394407272339, + "step": 4830 + }, + { + "epoch": 2.038818565400844, + "grad_norm": 1.2513408660888672, + "learning_rate": 8.159244876786232e-05, + "loss": 0.5515735745429993, + "step": 4832 + }, + { + "epoch": 2.039662447257384, + "grad_norm": 1.3035682439804077, + "learning_rate": 8.157383159586473e-05, + "loss": 0.757799506187439, + "step": 4834 + }, + { + "epoch": 2.0405063291139243, + "grad_norm": 1.1136540174484253, + "learning_rate": 8.155520714039056e-05, + "loss": 0.607295036315918, + "step": 4836 + }, + { + "epoch": 2.041350210970464, + "grad_norm": 1.220146656036377, + "learning_rate": 8.153657540573613e-05, + "loss": 0.5769712328910828, + "step": 4838 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 1.2104195356369019, + "learning_rate": 8.151793639619944e-05, + "loss": 0.5746933817863464, + "step": 4840 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 1.241708517074585, + "learning_rate": 8.149929011608014e-05, + "loss": 0.5932332277297974, + "step": 4842 + }, + { + "epoch": 2.0438818565400845, + "grad_norm": 1.1172713041305542, + "learning_rate": 8.148063656967955e-05, + "loss": 0.583284318447113, + "step": 4844 + }, + { + "epoch": 2.0447257383966244, + "grad_norm": 1.0867618322372437, + "learning_rate": 8.14619757613007e-05, + "loss": 0.5589476823806763, + "step": 4846 + }, + { + "epoch": 2.0455696202531644, + "grad_norm": 1.2470483779907227, + "learning_rate": 8.14433076952483e-05, + "loss": 0.6118156313896179, + "step": 4848 + }, + { + "epoch": 2.0464135021097047, + "grad_norm": 1.0908832550048828, + "learning_rate": 8.142463237582868e-05, + "loss": 0.5815895795822144, + "step": 4850 + }, + { + "epoch": 2.0472573839662447, + "grad_norm": 1.2589281797409058, + "learning_rate": 8.140594980734989e-05, + "loss": 0.6232373714447021, + "step": 4852 + }, + { + "epoch": 2.0481012658227846, + "grad_norm": 1.234152913093567, + "learning_rate": 8.138725999412165e-05, + "loss": 0.5992053151130676, + "step": 4854 + }, + { + "epoch": 2.048945147679325, + "grad_norm": 1.3304446935653687, + "learning_rate": 8.136856294045533e-05, + "loss": 0.6494496464729309, + "step": 4856 + }, + { + "epoch": 2.049789029535865, + "grad_norm": 1.1871088743209839, + "learning_rate": 8.134985865066398e-05, + "loss": 0.6263431906700134, + "step": 4858 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 1.1454699039459229, + "learning_rate": 8.133114712906234e-05, + "loss": 0.6036502122879028, + "step": 4860 + }, + { + "epoch": 2.0514767932489453, + "grad_norm": 1.2953420877456665, + "learning_rate": 8.131242837996675e-05, + "loss": 0.5674451589584351, + "step": 4862 + }, + { + "epoch": 2.052320675105485, + "grad_norm": 1.1874405145645142, + "learning_rate": 8.129370240769534e-05, + "loss": 0.5616317987442017, + "step": 4864 + }, + { + "epoch": 2.053164556962025, + "grad_norm": 1.2936227321624756, + "learning_rate": 8.127496921656777e-05, + "loss": 0.6495023369789124, + "step": 4866 + }, + { + "epoch": 2.0540084388185655, + "grad_norm": 1.1935228109359741, + "learning_rate": 8.125622881090544e-05, + "loss": 0.6028099060058594, + "step": 4868 + }, + { + "epoch": 2.0548523206751055, + "grad_norm": 0.9932331442832947, + "learning_rate": 8.123748119503143e-05, + "loss": 0.476296067237854, + "step": 4870 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 1.3878839015960693, + "learning_rate": 8.121872637327042e-05, + "loss": 0.6191902756690979, + "step": 4872 + }, + { + "epoch": 2.056540084388186, + "grad_norm": 1.1185581684112549, + "learning_rate": 8.11999643499488e-05, + "loss": 0.566487729549408, + "step": 4874 + }, + { + "epoch": 2.0573839662447257, + "grad_norm": 1.3729257583618164, + "learning_rate": 8.118119512939464e-05, + "loss": 0.5970078706741333, + "step": 4876 + }, + { + "epoch": 2.0582278481012657, + "grad_norm": 1.1332688331604004, + "learning_rate": 8.11624187159376e-05, + "loss": 0.570341944694519, + "step": 4878 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 1.2648937702178955, + "learning_rate": 8.114363511390903e-05, + "loss": 0.6302897334098816, + "step": 4880 + }, + { + "epoch": 2.059915611814346, + "grad_norm": 1.250616192817688, + "learning_rate": 8.112484432764197e-05, + "loss": 0.5619142651557922, + "step": 4882 + }, + { + "epoch": 2.060759493670886, + "grad_norm": 0.9710861444473267, + "learning_rate": 8.110604636147109e-05, + "loss": 0.5426228642463684, + "step": 4884 + }, + { + "epoch": 2.0616033755274263, + "grad_norm": 1.1979506015777588, + "learning_rate": 8.108724121973271e-05, + "loss": 0.5498107671737671, + "step": 4886 + }, + { + "epoch": 2.0624472573839663, + "grad_norm": 1.0936485528945923, + "learning_rate": 8.106842890676483e-05, + "loss": 0.5695134401321411, + "step": 4888 + }, + { + "epoch": 2.0632911392405062, + "grad_norm": 1.1246092319488525, + "learning_rate": 8.10496094269071e-05, + "loss": 0.5998331308364868, + "step": 4890 + }, + { + "epoch": 2.0641350210970466, + "grad_norm": 1.244438648223877, + "learning_rate": 8.103078278450075e-05, + "loss": 0.5702623128890991, + "step": 4892 + }, + { + "epoch": 2.0649789029535865, + "grad_norm": 1.1585633754730225, + "learning_rate": 8.101194898388881e-05, + "loss": 0.5392299890518188, + "step": 4894 + }, + { + "epoch": 2.0658227848101265, + "grad_norm": 1.3044285774230957, + "learning_rate": 8.099310802941582e-05, + "loss": 0.5640127658843994, + "step": 4896 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.2483032941818237, + "learning_rate": 8.097425992542804e-05, + "loss": 0.6103175282478333, + "step": 4898 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 1.0845462083816528, + "learning_rate": 8.095540467627337e-05, + "loss": 0.5041166543960571, + "step": 4900 + }, + { + "epoch": 2.067510548523207, + "eval_loss": 0.6941288113594055, + "eval_runtime": 513.4497, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 4900 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 1.2493232488632202, + "learning_rate": 8.093654228630134e-05, + "loss": 0.6253946423530579, + "step": 4902 + }, + { + "epoch": 2.0691983122362867, + "grad_norm": 1.1668756008148193, + "learning_rate": 8.091767275986317e-05, + "loss": 0.523486852645874, + "step": 4904 + }, + { + "epoch": 2.070042194092827, + "grad_norm": 1.1709638833999634, + "learning_rate": 8.089879610131167e-05, + "loss": 0.5569989681243896, + "step": 4906 + }, + { + "epoch": 2.070886075949367, + "grad_norm": 1.1044740676879883, + "learning_rate": 8.087991231500133e-05, + "loss": 0.642728865146637, + "step": 4908 + }, + { + "epoch": 2.071729957805907, + "grad_norm": 1.1032549142837524, + "learning_rate": 8.086102140528828e-05, + "loss": 0.5998259782791138, + "step": 4910 + }, + { + "epoch": 2.0725738396624473, + "grad_norm": 0.9980027079582214, + "learning_rate": 8.08421233765303e-05, + "loss": 0.5460172891616821, + "step": 4912 + }, + { + "epoch": 2.0734177215189873, + "grad_norm": 1.0866090059280396, + "learning_rate": 8.082321823308679e-05, + "loss": 0.5643284916877747, + "step": 4914 + }, + { + "epoch": 2.0742616033755272, + "grad_norm": 1.1942687034606934, + "learning_rate": 8.080430597931878e-05, + "loss": 0.554400622844696, + "step": 4916 + }, + { + "epoch": 2.0751054852320676, + "grad_norm": 1.0680599212646484, + "learning_rate": 8.078538661958901e-05, + "loss": 0.5955621004104614, + "step": 4918 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 1.20845627784729, + "learning_rate": 8.076646015826179e-05, + "loss": 0.5970203280448914, + "step": 4920 + }, + { + "epoch": 2.0767932489451475, + "grad_norm": 1.8368924856185913, + "learning_rate": 8.074752659970308e-05, + "loss": 0.6467664837837219, + "step": 4922 + }, + { + "epoch": 2.077637130801688, + "grad_norm": 1.3291922807693481, + "learning_rate": 8.072858594828053e-05, + "loss": 0.630719006061554, + "step": 4924 + }, + { + "epoch": 2.078481012658228, + "grad_norm": 1.1496083736419678, + "learning_rate": 8.070963820836333e-05, + "loss": 0.601140022277832, + "step": 4926 + }, + { + "epoch": 2.0793248945147678, + "grad_norm": 1.1562724113464355, + "learning_rate": 8.069068338432239e-05, + "loss": 0.6096881031990051, + "step": 4928 + }, + { + "epoch": 2.080168776371308, + "grad_norm": 1.0115300416946411, + "learning_rate": 8.067172148053021e-05, + "loss": 0.5085908770561218, + "step": 4930 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 1.2181830406188965, + "learning_rate": 8.065275250136097e-05, + "loss": 0.5268720984458923, + "step": 4932 + }, + { + "epoch": 2.081856540084388, + "grad_norm": 1.1249788999557495, + "learning_rate": 8.06337764511904e-05, + "loss": 0.6075665950775146, + "step": 4934 + }, + { + "epoch": 2.0827004219409284, + "grad_norm": 1.1143964529037476, + "learning_rate": 8.061479333439595e-05, + "loss": 0.59170001745224, + "step": 4936 + }, + { + "epoch": 2.0835443037974684, + "grad_norm": 1.4773131608963013, + "learning_rate": 8.059580315535664e-05, + "loss": 0.6689745187759399, + "step": 4938 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 1.143965244293213, + "learning_rate": 8.057680591845316e-05, + "loss": 0.5409777760505676, + "step": 4940 + }, + { + "epoch": 2.0852320675105487, + "grad_norm": 1.0384942293167114, + "learning_rate": 8.055780162806777e-05, + "loss": 0.5778636336326599, + "step": 4942 + }, + { + "epoch": 2.0860759493670886, + "grad_norm": 1.0102177858352661, + "learning_rate": 8.053879028858442e-05, + "loss": 0.5576038360595703, + "step": 4944 + }, + { + "epoch": 2.0869198312236286, + "grad_norm": 1.3792158365249634, + "learning_rate": 8.051977190438868e-05, + "loss": 0.5873376131057739, + "step": 4946 + }, + { + "epoch": 2.087763713080169, + "grad_norm": 1.4402949810028076, + "learning_rate": 8.050074647986768e-05, + "loss": 0.6067743301391602, + "step": 4948 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.2719058990478516, + "learning_rate": 8.048171401941027e-05, + "loss": 0.604671835899353, + "step": 4950 + }, + { + "epoch": 2.089451476793249, + "grad_norm": 1.1054867506027222, + "learning_rate": 8.046267452740683e-05, + "loss": 0.5743544697761536, + "step": 4952 + }, + { + "epoch": 2.090295358649789, + "grad_norm": 1.0521535873413086, + "learning_rate": 8.044362800824944e-05, + "loss": 0.576278567314148, + "step": 4954 + }, + { + "epoch": 2.091139240506329, + "grad_norm": 1.2665088176727295, + "learning_rate": 8.042457446633174e-05, + "loss": 0.5903641581535339, + "step": 4956 + }, + { + "epoch": 2.091983122362869, + "grad_norm": 1.1283398866653442, + "learning_rate": 8.040551390604902e-05, + "loss": 0.5854214429855347, + "step": 4958 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 1.1194316148757935, + "learning_rate": 8.03864463317982e-05, + "loss": 0.5843619108200073, + "step": 4960 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 1.3581651449203491, + "learning_rate": 8.036737174797778e-05, + "loss": 0.6115096211433411, + "step": 4962 + }, + { + "epoch": 2.0945147679324894, + "grad_norm": 1.341748595237732, + "learning_rate": 8.034829015898793e-05, + "loss": 0.5998795032501221, + "step": 4964 + }, + { + "epoch": 2.0953586497890297, + "grad_norm": 1.2212611436843872, + "learning_rate": 8.032920156923038e-05, + "loss": 0.628372311592102, + "step": 4966 + }, + { + "epoch": 2.0962025316455697, + "grad_norm": 1.1348317861557007, + "learning_rate": 8.031010598310851e-05, + "loss": 0.5668916702270508, + "step": 4968 + }, + { + "epoch": 2.0970464135021096, + "grad_norm": 1.1106547117233276, + "learning_rate": 8.029100340502731e-05, + "loss": 0.5253881216049194, + "step": 4970 + }, + { + "epoch": 2.09789029535865, + "grad_norm": 1.2471354007720947, + "learning_rate": 8.027189383939339e-05, + "loss": 0.5790762901306152, + "step": 4972 + }, + { + "epoch": 2.09873417721519, + "grad_norm": 1.2477394342422485, + "learning_rate": 8.025277729061492e-05, + "loss": 0.6382888555526733, + "step": 4974 + }, + { + "epoch": 2.09957805907173, + "grad_norm": 1.2716054916381836, + "learning_rate": 8.023365376310176e-05, + "loss": 0.5962072610855103, + "step": 4976 + }, + { + "epoch": 2.10042194092827, + "grad_norm": 1.257820725440979, + "learning_rate": 8.021452326126532e-05, + "loss": 0.5882940292358398, + "step": 4978 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 1.0924186706542969, + "learning_rate": 8.019538578951864e-05, + "loss": 0.5640701055526733, + "step": 4980 + }, + { + "epoch": 2.10210970464135, + "grad_norm": 1.1250383853912354, + "learning_rate": 8.017624135227637e-05, + "loss": 0.5746428966522217, + "step": 4982 + }, + { + "epoch": 2.10295358649789, + "grad_norm": 1.131323218345642, + "learning_rate": 8.015708995395477e-05, + "loss": 0.5611346960067749, + "step": 4984 + }, + { + "epoch": 2.1037974683544305, + "grad_norm": 1.4267152547836304, + "learning_rate": 8.013793159897171e-05, + "loss": 0.6173797249794006, + "step": 4986 + }, + { + "epoch": 2.1046413502109704, + "grad_norm": 1.41414213180542, + "learning_rate": 8.011876629174662e-05, + "loss": 0.64865642786026, + "step": 4988 + }, + { + "epoch": 2.1054852320675104, + "grad_norm": 1.1498184204101562, + "learning_rate": 8.00995940367006e-05, + "loss": 0.6125827431678772, + "step": 4990 + }, + { + "epoch": 2.1063291139240508, + "grad_norm": 1.2327708005905151, + "learning_rate": 8.00804148382563e-05, + "loss": 0.670495867729187, + "step": 4992 + }, + { + "epoch": 2.1071729957805907, + "grad_norm": 1.2797311544418335, + "learning_rate": 8.0061228700838e-05, + "loss": 0.6020209193229675, + "step": 4994 + }, + { + "epoch": 2.1080168776371306, + "grad_norm": 1.079584002494812, + "learning_rate": 8.004203562887157e-05, + "loss": 0.5974310636520386, + "step": 4996 + }, + { + "epoch": 2.108860759493671, + "grad_norm": 1.4352604150772095, + "learning_rate": 8.002283562678452e-05, + "loss": 0.6424587368965149, + "step": 4998 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 1.0876719951629639, + "learning_rate": 8.000362869900586e-05, + "loss": 0.6185846328735352, + "step": 5000 + }, + { + "epoch": 2.109704641350211, + "eval_loss": 0.6908889412879944, + "eval_runtime": 675.8398, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "step": 5000 + }, + { + "epoch": 2.110548523206751, + "grad_norm": 1.0125762224197388, + "learning_rate": 7.998441484996631e-05, + "loss": 0.6127280592918396, + "step": 5002 + }, + { + "epoch": 2.1113924050632913, + "grad_norm": 1.0253753662109375, + "learning_rate": 7.99651940840981e-05, + "loss": 0.5495694875717163, + "step": 5004 + }, + { + "epoch": 2.1122362869198312, + "grad_norm": 1.5620673894882202, + "learning_rate": 7.994596640583511e-05, + "loss": 0.6199497580528259, + "step": 5006 + }, + { + "epoch": 2.113080168776371, + "grad_norm": 1.3032969236373901, + "learning_rate": 7.992673181961281e-05, + "loss": 0.5896390676498413, + "step": 5008 + }, + { + "epoch": 2.1139240506329116, + "grad_norm": 1.0933046340942383, + "learning_rate": 7.990749032986821e-05, + "loss": 0.6332341432571411, + "step": 5010 + }, + { + "epoch": 2.1147679324894515, + "grad_norm": 1.3115314245224, + "learning_rate": 7.988824194104e-05, + "loss": 0.5964323282241821, + "step": 5012 + }, + { + "epoch": 2.1156118143459914, + "grad_norm": 1.229978084564209, + "learning_rate": 7.986898665756837e-05, + "loss": 0.5938325524330139, + "step": 5014 + }, + { + "epoch": 2.116455696202532, + "grad_norm": 1.1779940128326416, + "learning_rate": 7.984972448389517e-05, + "loss": 0.5761791467666626, + "step": 5016 + }, + { + "epoch": 2.1172995780590718, + "grad_norm": 1.063490629196167, + "learning_rate": 7.98304554244638e-05, + "loss": 0.6073653101921082, + "step": 5018 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 1.2390391826629639, + "learning_rate": 7.981117948371927e-05, + "loss": 0.6126761436462402, + "step": 5020 + }, + { + "epoch": 2.118987341772152, + "grad_norm": 1.1946247816085815, + "learning_rate": 7.979189666610818e-05, + "loss": 0.614434003829956, + "step": 5022 + }, + { + "epoch": 2.119831223628692, + "grad_norm": 1.1008374691009521, + "learning_rate": 7.977260697607867e-05, + "loss": 0.5947603583335876, + "step": 5024 + }, + { + "epoch": 2.120675105485232, + "grad_norm": 1.14899480342865, + "learning_rate": 7.975331041808054e-05, + "loss": 0.583965539932251, + "step": 5026 + }, + { + "epoch": 2.1215189873417724, + "grad_norm": 1.1627864837646484, + "learning_rate": 7.973400699656512e-05, + "loss": 0.615121603012085, + "step": 5028 + }, + { + "epoch": 2.1223628691983123, + "grad_norm": 1.3622617721557617, + "learning_rate": 7.971469671598532e-05, + "loss": 0.6268601417541504, + "step": 5030 + }, + { + "epoch": 2.1232067510548522, + "grad_norm": 1.1735879182815552, + "learning_rate": 7.96953795807957e-05, + "loss": 0.6021270155906677, + "step": 5032 + }, + { + "epoch": 2.124050632911392, + "grad_norm": 1.3856201171875, + "learning_rate": 7.96760555954523e-05, + "loss": 0.636816680431366, + "step": 5034 + }, + { + "epoch": 2.1248945147679326, + "grad_norm": 1.1410126686096191, + "learning_rate": 7.965672476441282e-05, + "loss": 0.5324423313140869, + "step": 5036 + }, + { + "epoch": 2.1257383966244725, + "grad_norm": 1.446070909500122, + "learning_rate": 7.963738709213651e-05, + "loss": 0.7433624267578125, + "step": 5038 + }, + { + "epoch": 2.1265822784810124, + "grad_norm": 1.3041753768920898, + "learning_rate": 7.961804258308419e-05, + "loss": 0.6359145641326904, + "step": 5040 + }, + { + "epoch": 2.127426160337553, + "grad_norm": 1.2043813467025757, + "learning_rate": 7.959869124171826e-05, + "loss": 0.6164234280586243, + "step": 5042 + }, + { + "epoch": 2.1282700421940928, + "grad_norm": 1.2375630140304565, + "learning_rate": 7.957933307250273e-05, + "loss": 0.6437279582023621, + "step": 5044 + }, + { + "epoch": 2.1291139240506327, + "grad_norm": 1.210644245147705, + "learning_rate": 7.955996807990314e-05, + "loss": 0.585924506187439, + "step": 5046 + }, + { + "epoch": 2.129957805907173, + "grad_norm": 1.2011489868164062, + "learning_rate": 7.954059626838661e-05, + "loss": 0.6081803441047668, + "step": 5048 + }, + { + "epoch": 2.130801687763713, + "grad_norm": 1.0365782976150513, + "learning_rate": 7.952121764242187e-05, + "loss": 0.5609047412872314, + "step": 5050 + }, + { + "epoch": 2.131645569620253, + "grad_norm": 1.7950767278671265, + "learning_rate": 7.950183220647918e-05, + "loss": 0.5612874031066895, + "step": 5052 + }, + { + "epoch": 2.1324894514767934, + "grad_norm": 1.2933409214019775, + "learning_rate": 7.94824399650304e-05, + "loss": 0.6554630994796753, + "step": 5054 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 1.129828929901123, + "learning_rate": 7.946304092254894e-05, + "loss": 0.5623239278793335, + "step": 5056 + }, + { + "epoch": 2.1341772151898732, + "grad_norm": 1.1060296297073364, + "learning_rate": 7.944363508350978e-05, + "loss": 0.5036910772323608, + "step": 5058 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 1.2622627019882202, + "learning_rate": 7.94242224523895e-05, + "loss": 0.5840913653373718, + "step": 5060 + }, + { + "epoch": 2.1358649789029536, + "grad_norm": 1.3803153038024902, + "learning_rate": 7.940480303366618e-05, + "loss": 0.6365578770637512, + "step": 5062 + }, + { + "epoch": 2.1367088607594935, + "grad_norm": 1.2524651288986206, + "learning_rate": 7.938537683181955e-05, + "loss": 0.6167916655540466, + "step": 5064 + }, + { + "epoch": 2.137552742616034, + "grad_norm": 1.3320350646972656, + "learning_rate": 7.936594385133083e-05, + "loss": 0.6356930732727051, + "step": 5066 + }, + { + "epoch": 2.138396624472574, + "grad_norm": 1.3180949687957764, + "learning_rate": 7.934650409668285e-05, + "loss": 0.5888242721557617, + "step": 5068 + }, + { + "epoch": 2.1392405063291138, + "grad_norm": 1.1376243829727173, + "learning_rate": 7.932705757235999e-05, + "loss": 0.608725905418396, + "step": 5070 + }, + { + "epoch": 2.140084388185654, + "grad_norm": 1.1734369993209839, + "learning_rate": 7.930760428284817e-05, + "loss": 0.5824158787727356, + "step": 5072 + }, + { + "epoch": 2.140928270042194, + "grad_norm": 1.1038579940795898, + "learning_rate": 7.928814423263493e-05, + "loss": 0.5629416704177856, + "step": 5074 + }, + { + "epoch": 2.141772151898734, + "grad_norm": 1.269780158996582, + "learning_rate": 7.926867742620929e-05, + "loss": 0.5994445085525513, + "step": 5076 + }, + { + "epoch": 2.1426160337552744, + "grad_norm": 1.2274279594421387, + "learning_rate": 7.924920386806188e-05, + "loss": 0.5845475792884827, + "step": 5078 + }, + { + "epoch": 2.1434599156118144, + "grad_norm": 1.168766975402832, + "learning_rate": 7.922972356268488e-05, + "loss": 0.621201753616333, + "step": 5080 + }, + { + "epoch": 2.1443037974683543, + "grad_norm": 1.0057638883590698, + "learning_rate": 7.921023651457203e-05, + "loss": 0.5282597541809082, + "step": 5082 + }, + { + "epoch": 2.1451476793248947, + "grad_norm": 1.432309865951538, + "learning_rate": 7.91907427282186e-05, + "loss": 0.632583737373352, + "step": 5084 + }, + { + "epoch": 2.1459915611814346, + "grad_norm": 1.3939776420593262, + "learning_rate": 7.917124220812144e-05, + "loss": 0.6239289045333862, + "step": 5086 + }, + { + "epoch": 2.1468354430379746, + "grad_norm": 1.3741775751113892, + "learning_rate": 7.915173495877895e-05, + "loss": 0.5749062895774841, + "step": 5088 + }, + { + "epoch": 2.147679324894515, + "grad_norm": 1.3123528957366943, + "learning_rate": 7.913222098469109e-05, + "loss": 0.6011738181114197, + "step": 5090 + }, + { + "epoch": 2.148523206751055, + "grad_norm": 1.3473498821258545, + "learning_rate": 7.911270029035932e-05, + "loss": 0.5804699659347534, + "step": 5092 + }, + { + "epoch": 2.149367088607595, + "grad_norm": 1.0873067378997803, + "learning_rate": 7.909317288028673e-05, + "loss": 0.6446103453636169, + "step": 5094 + }, + { + "epoch": 2.1502109704641352, + "grad_norm": 1.1374083757400513, + "learning_rate": 7.907363875897789e-05, + "loss": 0.6136524677276611, + "step": 5096 + }, + { + "epoch": 2.151054852320675, + "grad_norm": 1.1356533765792847, + "learning_rate": 7.905409793093896e-05, + "loss": 0.5107976794242859, + "step": 5098 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 1.2579567432403564, + "learning_rate": 7.903455040067763e-05, + "loss": 0.6073099374771118, + "step": 5100 + }, + { + "epoch": 2.151898734177215, + "eval_loss": 0.6902023553848267, + "eval_runtime": 733.915, + "eval_samples_per_second": 2.871, + "eval_steps_per_second": 2.871, + "step": 5100 + }, + { + "epoch": 2.1527426160337555, + "grad_norm": 1.2401398420333862, + "learning_rate": 7.901499617270315e-05, + "loss": 0.5562406182289124, + "step": 5102 + }, + { + "epoch": 2.1535864978902954, + "grad_norm": 1.086590051651001, + "learning_rate": 7.899543525152628e-05, + "loss": 0.5749467015266418, + "step": 5104 + }, + { + "epoch": 2.1544303797468354, + "grad_norm": 1.206458568572998, + "learning_rate": 7.897586764165939e-05, + "loss": 0.6326877474784851, + "step": 5106 + }, + { + "epoch": 2.1552742616033758, + "grad_norm": 1.030740737915039, + "learning_rate": 7.895629334761632e-05, + "loss": 0.5616445541381836, + "step": 5108 + }, + { + "epoch": 2.1561181434599157, + "grad_norm": 1.3338581323623657, + "learning_rate": 7.89367123739125e-05, + "loss": 0.6307384371757507, + "step": 5110 + }, + { + "epoch": 2.1569620253164556, + "grad_norm": 1.2684671878814697, + "learning_rate": 7.891712472506485e-05, + "loss": 0.6087653636932373, + "step": 5112 + }, + { + "epoch": 2.1578059071729956, + "grad_norm": 1.1610581874847412, + "learning_rate": 7.889753040559188e-05, + "loss": 0.5747998952865601, + "step": 5114 + }, + { + "epoch": 2.158649789029536, + "grad_norm": 1.4069275856018066, + "learning_rate": 7.887792942001366e-05, + "loss": 0.6143770217895508, + "step": 5116 + }, + { + "epoch": 2.159493670886076, + "grad_norm": 1.0858227014541626, + "learning_rate": 7.885832177285173e-05, + "loss": 0.552534282207489, + "step": 5118 + }, + { + "epoch": 2.160337552742616, + "grad_norm": 1.067070722579956, + "learning_rate": 7.88387074686292e-05, + "loss": 0.5781989693641663, + "step": 5120 + }, + { + "epoch": 2.1611814345991562, + "grad_norm": 1.139981746673584, + "learning_rate": 7.881908651187072e-05, + "loss": 0.5521422624588013, + "step": 5122 + }, + { + "epoch": 2.162025316455696, + "grad_norm": 1.0987457036972046, + "learning_rate": 7.879945890710245e-05, + "loss": 0.5755025744438171, + "step": 5124 + }, + { + "epoch": 2.162869198312236, + "grad_norm": 1.1530758142471313, + "learning_rate": 7.877982465885214e-05, + "loss": 0.5783509612083435, + "step": 5126 + }, + { + "epoch": 2.1637130801687765, + "grad_norm": 1.2285696268081665, + "learning_rate": 7.876018377164899e-05, + "loss": 0.5942281484603882, + "step": 5128 + }, + { + "epoch": 2.1645569620253164, + "grad_norm": 1.1283711194992065, + "learning_rate": 7.874053625002378e-05, + "loss": 0.5539707541465759, + "step": 5130 + }, + { + "epoch": 2.1654008438818564, + "grad_norm": 1.3213335275650024, + "learning_rate": 7.872088209850885e-05, + "loss": 0.5955292582511902, + "step": 5132 + }, + { + "epoch": 2.1662447257383968, + "grad_norm": 1.1748592853546143, + "learning_rate": 7.8701221321638e-05, + "loss": 0.5422899723052979, + "step": 5134 + }, + { + "epoch": 2.1670886075949367, + "grad_norm": 1.0752148628234863, + "learning_rate": 7.868155392394662e-05, + "loss": 0.5547205209732056, + "step": 5136 + }, + { + "epoch": 2.1679324894514767, + "grad_norm": 1.1814554929733276, + "learning_rate": 7.86618799099716e-05, + "loss": 0.5938948392868042, + "step": 5138 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 1.3455278873443604, + "learning_rate": 7.864219928425132e-05, + "loss": 0.6468925476074219, + "step": 5140 + }, + { + "epoch": 2.169620253164557, + "grad_norm": 1.2695354223251343, + "learning_rate": 7.862251205132576e-05, + "loss": 0.5704391002655029, + "step": 5142 + }, + { + "epoch": 2.170464135021097, + "grad_norm": 1.1529468297958374, + "learning_rate": 7.860281821573638e-05, + "loss": 0.6057283878326416, + "step": 5144 + }, + { + "epoch": 2.1713080168776373, + "grad_norm": 1.3461004495620728, + "learning_rate": 7.858311778202616e-05, + "loss": 0.6135527491569519, + "step": 5146 + }, + { + "epoch": 2.1721518987341772, + "grad_norm": 1.1258536577224731, + "learning_rate": 7.856341075473962e-05, + "loss": 0.5585638880729675, + "step": 5148 + }, + { + "epoch": 2.172995780590717, + "grad_norm": 1.254898190498352, + "learning_rate": 7.854369713842279e-05, + "loss": 0.5780918002128601, + "step": 5150 + }, + { + "epoch": 2.1738396624472576, + "grad_norm": 1.2730201482772827, + "learning_rate": 7.852397693762321e-05, + "loss": 0.595267117023468, + "step": 5152 + }, + { + "epoch": 2.1746835443037975, + "grad_norm": 1.1875078678131104, + "learning_rate": 7.850425015688999e-05, + "loss": 0.5636162161827087, + "step": 5154 + }, + { + "epoch": 2.1755274261603375, + "grad_norm": 1.0930945873260498, + "learning_rate": 7.848451680077366e-05, + "loss": 0.6362089514732361, + "step": 5156 + }, + { + "epoch": 2.176371308016878, + "grad_norm": 1.2274452447891235, + "learning_rate": 7.846477687382639e-05, + "loss": 0.6268675327301025, + "step": 5158 + }, + { + "epoch": 2.1772151898734178, + "grad_norm": 1.2023133039474487, + "learning_rate": 7.844503038060176e-05, + "loss": 0.6014906167984009, + "step": 5160 + }, + { + "epoch": 2.1780590717299577, + "grad_norm": 1.2616889476776123, + "learning_rate": 7.842527732565491e-05, + "loss": 0.6180019974708557, + "step": 5162 + }, + { + "epoch": 2.1789029535864977, + "grad_norm": 1.1046907901763916, + "learning_rate": 7.84055177135425e-05, + "loss": 0.5400100946426392, + "step": 5164 + }, + { + "epoch": 2.179746835443038, + "grad_norm": 1.1664032936096191, + "learning_rate": 7.83857515488227e-05, + "loss": 0.5713199973106384, + "step": 5166 + }, + { + "epoch": 2.180590717299578, + "grad_norm": 1.2526558637619019, + "learning_rate": 7.836597883605519e-05, + "loss": 0.5741307735443115, + "step": 5168 + }, + { + "epoch": 2.181434599156118, + "grad_norm": 1.0457103252410889, + "learning_rate": 7.834619957980112e-05, + "loss": 0.47188031673431396, + "step": 5170 + }, + { + "epoch": 2.1822784810126583, + "grad_norm": 1.1978110074996948, + "learning_rate": 7.832641378462319e-05, + "loss": 0.6149471998214722, + "step": 5172 + }, + { + "epoch": 2.1831223628691983, + "grad_norm": 1.2231460809707642, + "learning_rate": 7.830662145508567e-05, + "loss": 0.5520018339157104, + "step": 5174 + }, + { + "epoch": 2.183966244725738, + "grad_norm": 1.4367618560791016, + "learning_rate": 7.828682259575417e-05, + "loss": 0.6536548733711243, + "step": 5176 + }, + { + "epoch": 2.1848101265822786, + "grad_norm": 1.0891374349594116, + "learning_rate": 7.826701721119598e-05, + "loss": 0.5324372053146362, + "step": 5178 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 1.118695616722107, + "learning_rate": 7.82472053059798e-05, + "loss": 0.6127952337265015, + "step": 5180 + }, + { + "epoch": 2.1864978902953585, + "grad_norm": 1.1116070747375488, + "learning_rate": 7.822738688467585e-05, + "loss": 0.505962610244751, + "step": 5182 + }, + { + "epoch": 2.187341772151899, + "grad_norm": 1.2140545845031738, + "learning_rate": 7.820756195185586e-05, + "loss": 0.6210073232650757, + "step": 5184 + }, + { + "epoch": 2.188185654008439, + "grad_norm": 1.2135601043701172, + "learning_rate": 7.818773051209307e-05, + "loss": 0.6517674326896667, + "step": 5186 + }, + { + "epoch": 2.1890295358649787, + "grad_norm": 1.3875514268875122, + "learning_rate": 7.816789256996218e-05, + "loss": 0.5577492117881775, + "step": 5188 + }, + { + "epoch": 2.189873417721519, + "grad_norm": 1.181325912475586, + "learning_rate": 7.814804813003949e-05, + "loss": 0.6010199189186096, + "step": 5190 + }, + { + "epoch": 2.190717299578059, + "grad_norm": 1.102044701576233, + "learning_rate": 7.812819719690265e-05, + "loss": 0.5635302662849426, + "step": 5192 + }, + { + "epoch": 2.191561181434599, + "grad_norm": 1.4227958917617798, + "learning_rate": 7.810833977513094e-05, + "loss": 0.5804321765899658, + "step": 5194 + }, + { + "epoch": 2.1924050632911394, + "grad_norm": 1.2573446035385132, + "learning_rate": 7.80884758693051e-05, + "loss": 0.6005555987358093, + "step": 5196 + }, + { + "epoch": 2.1932489451476793, + "grad_norm": 1.3534085750579834, + "learning_rate": 7.80686054840073e-05, + "loss": 0.6263643503189087, + "step": 5198 + }, + { + "epoch": 2.1940928270042193, + "grad_norm": 1.6895852088928223, + "learning_rate": 7.804872862382131e-05, + "loss": 0.6235764622688293, + "step": 5200 + }, + { + "epoch": 2.1940928270042193, + "eval_loss": 0.6915348172187805, + "eval_runtime": 1167.9782, + "eval_samples_per_second": 1.804, + "eval_steps_per_second": 1.804, + "step": 5200 + }, + { + "epoch": 2.1949367088607596, + "grad_norm": 1.138973593711853, + "learning_rate": 7.802884529333227e-05, + "loss": 0.5586035847663879, + "step": 5202 + }, + { + "epoch": 2.1957805907172996, + "grad_norm": 1.3664026260375977, + "learning_rate": 7.800895549712697e-05, + "loss": 0.5768917202949524, + "step": 5204 + }, + { + "epoch": 2.1966244725738395, + "grad_norm": 1.2182449102401733, + "learning_rate": 7.798905923979353e-05, + "loss": 0.6046215891838074, + "step": 5206 + }, + { + "epoch": 2.19746835443038, + "grad_norm": 1.2692211866378784, + "learning_rate": 7.796915652592167e-05, + "loss": 0.5412904024124146, + "step": 5208 + }, + { + "epoch": 2.19831223628692, + "grad_norm": 1.200822114944458, + "learning_rate": 7.794924736010256e-05, + "loss": 0.5328584909439087, + "step": 5210 + }, + { + "epoch": 2.19915611814346, + "grad_norm": 1.1093779802322388, + "learning_rate": 7.792933174692886e-05, + "loss": 0.5497913360595703, + "step": 5212 + }, + { + "epoch": 2.2, + "grad_norm": 1.3838921785354614, + "learning_rate": 7.790940969099471e-05, + "loss": 0.5908066034317017, + "step": 5214 + }, + { + "epoch": 2.20084388185654, + "grad_norm": 1.1411913633346558, + "learning_rate": 7.788948119689576e-05, + "loss": 0.6117307543754578, + "step": 5216 + }, + { + "epoch": 2.20168776371308, + "grad_norm": 1.5668916702270508, + "learning_rate": 7.786954626922913e-05, + "loss": 0.5788605809211731, + "step": 5218 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 1.195027232170105, + "learning_rate": 7.784960491259344e-05, + "loss": 0.5948591828346252, + "step": 5220 + }, + { + "epoch": 2.2033755274261604, + "grad_norm": 1.2665271759033203, + "learning_rate": 7.782965713158872e-05, + "loss": 0.6321669220924377, + "step": 5222 + }, + { + "epoch": 2.2042194092827003, + "grad_norm": 1.123711109161377, + "learning_rate": 7.78097029308166e-05, + "loss": 0.5853859186172485, + "step": 5224 + }, + { + "epoch": 2.2050632911392407, + "grad_norm": 1.9381071329116821, + "learning_rate": 7.77897423148801e-05, + "loss": 0.6485977172851562, + "step": 5226 + }, + { + "epoch": 2.2059071729957807, + "grad_norm": 1.4062265157699585, + "learning_rate": 7.776977528838376e-05, + "loss": 0.6243517398834229, + "step": 5228 + }, + { + "epoch": 2.2067510548523206, + "grad_norm": 1.2127182483673096, + "learning_rate": 7.774980185593358e-05, + "loss": 0.5770578980445862, + "step": 5230 + }, + { + "epoch": 2.207594936708861, + "grad_norm": 1.250847578048706, + "learning_rate": 7.772982202213709e-05, + "loss": 0.6521194577217102, + "step": 5232 + }, + { + "epoch": 2.208438818565401, + "grad_norm": 1.2568131685256958, + "learning_rate": 7.77098357916032e-05, + "loss": 0.5755271911621094, + "step": 5234 + }, + { + "epoch": 2.209282700421941, + "grad_norm": 1.2422975301742554, + "learning_rate": 7.768984316894236e-05, + "loss": 0.5486469864845276, + "step": 5236 + }, + { + "epoch": 2.2101265822784812, + "grad_norm": 1.1018635034561157, + "learning_rate": 7.766984415876652e-05, + "loss": 0.5512928366661072, + "step": 5238 + }, + { + "epoch": 2.210970464135021, + "grad_norm": 1.2261123657226562, + "learning_rate": 7.764983876568903e-05, + "loss": 0.5753499269485474, + "step": 5240 + }, + { + "epoch": 2.211814345991561, + "grad_norm": 1.2222342491149902, + "learning_rate": 7.762982699432474e-05, + "loss": 0.5404848456382751, + "step": 5242 + }, + { + "epoch": 2.212658227848101, + "grad_norm": 1.231494426727295, + "learning_rate": 7.760980884929004e-05, + "loss": 0.5999218821525574, + "step": 5244 + }, + { + "epoch": 2.2135021097046415, + "grad_norm": 1.1530078649520874, + "learning_rate": 7.758978433520268e-05, + "loss": 0.6123101115226746, + "step": 5246 + }, + { + "epoch": 2.2143459915611814, + "grad_norm": 1.182706594467163, + "learning_rate": 7.756975345668194e-05, + "loss": 0.5945886969566345, + "step": 5248 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 1.0788652896881104, + "learning_rate": 7.754971621834857e-05, + "loss": 0.5698213577270508, + "step": 5250 + }, + { + "epoch": 2.2160337552742617, + "grad_norm": 1.2243359088897705, + "learning_rate": 7.752967262482477e-05, + "loss": 0.5959678888320923, + "step": 5252 + }, + { + "epoch": 2.2168776371308017, + "grad_norm": 1.4292869567871094, + "learning_rate": 7.750962268073421e-05, + "loss": 0.586794376373291, + "step": 5254 + }, + { + "epoch": 2.2177215189873416, + "grad_norm": 1.1809570789337158, + "learning_rate": 7.748956639070204e-05, + "loss": 0.5513298511505127, + "step": 5256 + }, + { + "epoch": 2.218565400843882, + "grad_norm": 1.485813856124878, + "learning_rate": 7.746950375935484e-05, + "loss": 0.6402831673622131, + "step": 5258 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 1.0851374864578247, + "learning_rate": 7.744943479132069e-05, + "loss": 0.5729117393493652, + "step": 5260 + }, + { + "epoch": 2.220253164556962, + "grad_norm": 1.4308949708938599, + "learning_rate": 7.742935949122911e-05, + "loss": 0.6239725947380066, + "step": 5262 + }, + { + "epoch": 2.2210970464135023, + "grad_norm": 1.379258155822754, + "learning_rate": 7.740927786371107e-05, + "loss": 0.6260181069374084, + "step": 5264 + }, + { + "epoch": 2.221940928270042, + "grad_norm": 1.1661925315856934, + "learning_rate": 7.738918991339905e-05, + "loss": 0.6074157357215881, + "step": 5266 + }, + { + "epoch": 2.222784810126582, + "grad_norm": 1.168901801109314, + "learning_rate": 7.736909564492694e-05, + "loss": 0.6119515895843506, + "step": 5268 + }, + { + "epoch": 2.2236286919831225, + "grad_norm": 1.1451057195663452, + "learning_rate": 7.734899506293008e-05, + "loss": 0.5505842566490173, + "step": 5270 + }, + { + "epoch": 2.2244725738396625, + "grad_norm": 1.2303991317749023, + "learning_rate": 7.732888817204533e-05, + "loss": 0.6117991805076599, + "step": 5272 + }, + { + "epoch": 2.2253164556962024, + "grad_norm": 1.04572331905365, + "learning_rate": 7.730877497691092e-05, + "loss": 0.5589770078659058, + "step": 5274 + }, + { + "epoch": 2.226160337552743, + "grad_norm": 1.2047234773635864, + "learning_rate": 7.72886554821666e-05, + "loss": 0.6288654208183289, + "step": 5276 + }, + { + "epoch": 2.2270042194092827, + "grad_norm": 1.2036652565002441, + "learning_rate": 7.726852969245355e-05, + "loss": 0.6174501776695251, + "step": 5278 + }, + { + "epoch": 2.2278481012658227, + "grad_norm": 1.1740167140960693, + "learning_rate": 7.72483976124144e-05, + "loss": 0.6027677655220032, + "step": 5280 + }, + { + "epoch": 2.228691983122363, + "grad_norm": 1.0600008964538574, + "learning_rate": 7.722825924669326e-05, + "loss": 0.6016151309013367, + "step": 5282 + }, + { + "epoch": 2.229535864978903, + "grad_norm": 1.2631008625030518, + "learning_rate": 7.720811459993562e-05, + "loss": 0.5905849933624268, + "step": 5284 + }, + { + "epoch": 2.230379746835443, + "grad_norm": 1.1024738550186157, + "learning_rate": 7.718796367678848e-05, + "loss": 0.5129587054252625, + "step": 5286 + }, + { + "epoch": 2.2312236286919833, + "grad_norm": 1.23116934299469, + "learning_rate": 7.716780648190028e-05, + "loss": 0.5709586143493652, + "step": 5288 + }, + { + "epoch": 2.2320675105485233, + "grad_norm": 1.2739102840423584, + "learning_rate": 7.714764301992088e-05, + "loss": 0.5454761385917664, + "step": 5290 + }, + { + "epoch": 2.232911392405063, + "grad_norm": 1.303963303565979, + "learning_rate": 7.712747329550162e-05, + "loss": 0.537248969078064, + "step": 5292 + }, + { + "epoch": 2.233755274261603, + "grad_norm": 1.2454309463500977, + "learning_rate": 7.710729731329529e-05, + "loss": 0.6364415884017944, + "step": 5294 + }, + { + "epoch": 2.2345991561181435, + "grad_norm": 1.2401882410049438, + "learning_rate": 7.708711507795605e-05, + "loss": 0.5640100240707397, + "step": 5296 + }, + { + "epoch": 2.2354430379746835, + "grad_norm": 1.197432041168213, + "learning_rate": 7.706692659413959e-05, + "loss": 0.5919729471206665, + "step": 5298 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 1.1779764890670776, + "learning_rate": 7.704673186650298e-05, + "loss": 0.5569849014282227, + "step": 5300 + }, + { + "epoch": 2.2362869198312234, + "eval_loss": 0.6898328065872192, + "eval_runtime": 739.3794, + "eval_samples_per_second": 2.85, + "eval_steps_per_second": 2.85, + "step": 5300 + }, + { + "epoch": 2.237130801687764, + "grad_norm": 1.1371463537216187, + "learning_rate": 7.702653089970479e-05, + "loss": 0.5823061466217041, + "step": 5302 + }, + { + "epoch": 2.2379746835443037, + "grad_norm": 1.1877846717834473, + "learning_rate": 7.700632369840497e-05, + "loss": 0.5556252002716064, + "step": 5304 + }, + { + "epoch": 2.2388185654008437, + "grad_norm": 1.1580896377563477, + "learning_rate": 7.698611026726492e-05, + "loss": 0.5794119834899902, + "step": 5306 + }, + { + "epoch": 2.239662447257384, + "grad_norm": 1.29141366481781, + "learning_rate": 7.696589061094755e-05, + "loss": 0.5828680396080017, + "step": 5308 + }, + { + "epoch": 2.240506329113924, + "grad_norm": 1.1286728382110596, + "learning_rate": 7.694566473411706e-05, + "loss": 0.6161736845970154, + "step": 5310 + }, + { + "epoch": 2.241350210970464, + "grad_norm": 1.0969985723495483, + "learning_rate": 7.692543264143925e-05, + "loss": 0.570767879486084, + "step": 5312 + }, + { + "epoch": 2.2421940928270043, + "grad_norm": 1.2902227640151978, + "learning_rate": 7.690519433758123e-05, + "loss": 0.631476104259491, + "step": 5314 + }, + { + "epoch": 2.2430379746835443, + "grad_norm": 1.432735800743103, + "learning_rate": 7.68849498272116e-05, + "loss": 0.6142309904098511, + "step": 5316 + }, + { + "epoch": 2.243881856540084, + "grad_norm": 1.0824161767959595, + "learning_rate": 7.686469911500038e-05, + "loss": 0.5871514081954956, + "step": 5318 + }, + { + "epoch": 2.2447257383966246, + "grad_norm": 1.1694978475570679, + "learning_rate": 7.684444220561902e-05, + "loss": 0.6144557595252991, + "step": 5320 + }, + { + "epoch": 2.2455696202531645, + "grad_norm": 1.2981040477752686, + "learning_rate": 7.68241791037404e-05, + "loss": 0.6049425601959229, + "step": 5322 + }, + { + "epoch": 2.2464135021097045, + "grad_norm": 1.132128357887268, + "learning_rate": 7.680390981403885e-05, + "loss": 0.5571867823600769, + "step": 5324 + }, + { + "epoch": 2.247257383966245, + "grad_norm": 1.1760079860687256, + "learning_rate": 7.678363434119005e-05, + "loss": 0.5710517168045044, + "step": 5326 + }, + { + "epoch": 2.248101265822785, + "grad_norm": 1.1918572187423706, + "learning_rate": 7.67633526898712e-05, + "loss": 0.5508866906166077, + "step": 5328 + }, + { + "epoch": 2.2489451476793247, + "grad_norm": 1.1837294101715088, + "learning_rate": 7.674306486476091e-05, + "loss": 0.6242696046829224, + "step": 5330 + }, + { + "epoch": 2.249789029535865, + "grad_norm": 1.384918212890625, + "learning_rate": 7.672277087053914e-05, + "loss": 0.5821678042411804, + "step": 5332 + }, + { + "epoch": 2.250632911392405, + "grad_norm": 1.1248877048492432, + "learning_rate": 7.670247071188738e-05, + "loss": 0.5415928363800049, + "step": 5334 + }, + { + "epoch": 2.251476793248945, + "grad_norm": 1.228140950202942, + "learning_rate": 7.668216439348843e-05, + "loss": 0.5475174188613892, + "step": 5336 + }, + { + "epoch": 2.2523206751054854, + "grad_norm": 1.3816046714782715, + "learning_rate": 7.666185192002662e-05, + "loss": 0.5793306231498718, + "step": 5338 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 1.2446565628051758, + "learning_rate": 7.664153329618759e-05, + "loss": 0.6221131682395935, + "step": 5340 + }, + { + "epoch": 2.2540084388185653, + "grad_norm": 1.1677669286727905, + "learning_rate": 7.662120852665852e-05, + "loss": 0.5403847694396973, + "step": 5342 + }, + { + "epoch": 2.2548523206751057, + "grad_norm": 1.2485873699188232, + "learning_rate": 7.66008776161279e-05, + "loss": 0.620201587677002, + "step": 5344 + }, + { + "epoch": 2.2556962025316456, + "grad_norm": 1.2486802339553833, + "learning_rate": 7.658054056928568e-05, + "loss": 0.5969216227531433, + "step": 5346 + }, + { + "epoch": 2.2565400843881855, + "grad_norm": 1.2621372938156128, + "learning_rate": 7.656019739082326e-05, + "loss": 0.6376339793205261, + "step": 5348 + }, + { + "epoch": 2.257383966244726, + "grad_norm": 1.238633155822754, + "learning_rate": 7.65398480854334e-05, + "loss": 0.6374872326850891, + "step": 5350 + }, + { + "epoch": 2.258227848101266, + "grad_norm": 1.3031803369522095, + "learning_rate": 7.651949265781029e-05, + "loss": 0.6348551511764526, + "step": 5352 + }, + { + "epoch": 2.259071729957806, + "grad_norm": 1.3735158443450928, + "learning_rate": 7.649913111264952e-05, + "loss": 0.6267750859260559, + "step": 5354 + }, + { + "epoch": 2.259915611814346, + "grad_norm": 1.1227772235870361, + "learning_rate": 7.647876345464817e-05, + "loss": 0.623030960559845, + "step": 5356 + }, + { + "epoch": 2.260759493670886, + "grad_norm": 1.4555678367614746, + "learning_rate": 7.645838968850459e-05, + "loss": 0.5810713171958923, + "step": 5358 + }, + { + "epoch": 2.261603375527426, + "grad_norm": 1.227725863456726, + "learning_rate": 7.643800981891867e-05, + "loss": 0.6150093078613281, + "step": 5360 + }, + { + "epoch": 2.2624472573839665, + "grad_norm": 1.0648300647735596, + "learning_rate": 7.641762385059161e-05, + "loss": 0.5350445508956909, + "step": 5362 + }, + { + "epoch": 2.2632911392405064, + "grad_norm": 1.179452896118164, + "learning_rate": 7.639723178822613e-05, + "loss": 0.6253421306610107, + "step": 5364 + }, + { + "epoch": 2.2641350210970463, + "grad_norm": 1.0983240604400635, + "learning_rate": 7.637683363652621e-05, + "loss": 0.5512562990188599, + "step": 5366 + }, + { + "epoch": 2.2649789029535867, + "grad_norm": 1.1825451850891113, + "learning_rate": 7.635642940019736e-05, + "loss": 0.5584151148796082, + "step": 5368 + }, + { + "epoch": 2.2658227848101267, + "grad_norm": 1.1022000312805176, + "learning_rate": 7.633601908394643e-05, + "loss": 0.5881790518760681, + "step": 5370 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.1935697793960571, + "learning_rate": 7.631560269248169e-05, + "loss": 0.6060683131217957, + "step": 5372 + }, + { + "epoch": 2.267510548523207, + "grad_norm": 1.1174103021621704, + "learning_rate": 7.62951802305128e-05, + "loss": 0.5877062678337097, + "step": 5374 + }, + { + "epoch": 2.268354430379747, + "grad_norm": 1.3934977054595947, + "learning_rate": 7.627475170275086e-05, + "loss": 0.5145504474639893, + "step": 5376 + }, + { + "epoch": 2.269198312236287, + "grad_norm": 1.2637842893600464, + "learning_rate": 7.625431711390831e-05, + "loss": 0.6194025874137878, + "step": 5378 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 1.2034388780593872, + "learning_rate": 7.623387646869902e-05, + "loss": 0.6205627918243408, + "step": 5380 + }, + { + "epoch": 2.270886075949367, + "grad_norm": 0.953880250453949, + "learning_rate": 7.621342977183826e-05, + "loss": 0.5609696507453918, + "step": 5382 + }, + { + "epoch": 2.271729957805907, + "grad_norm": 1.2841949462890625, + "learning_rate": 7.619297702804272e-05, + "loss": 0.6044906377792358, + "step": 5384 + }, + { + "epoch": 2.272573839662447, + "grad_norm": 1.146804690361023, + "learning_rate": 7.617251824203037e-05, + "loss": 0.5420435667037964, + "step": 5386 + }, + { + "epoch": 2.2734177215189875, + "grad_norm": 1.2225698232650757, + "learning_rate": 7.615205341852076e-05, + "loss": 0.6230710744857788, + "step": 5388 + }, + { + "epoch": 2.2742616033755274, + "grad_norm": 1.3423371315002441, + "learning_rate": 7.613158256223467e-05, + "loss": 0.6486349701881409, + "step": 5390 + }, + { + "epoch": 2.2751054852320673, + "grad_norm": 1.0840023756027222, + "learning_rate": 7.611110567789435e-05, + "loss": 0.6527825593948364, + "step": 5392 + }, + { + "epoch": 2.2759493670886077, + "grad_norm": 1.342466950416565, + "learning_rate": 7.609062277022341e-05, + "loss": 0.6859483122825623, + "step": 5394 + }, + { + "epoch": 2.2767932489451477, + "grad_norm": 1.0406129360198975, + "learning_rate": 7.607013384394691e-05, + "loss": 0.5536003708839417, + "step": 5396 + }, + { + "epoch": 2.2776371308016876, + "grad_norm": 1.0853544473648071, + "learning_rate": 7.604963890379118e-05, + "loss": 0.5488654971122742, + "step": 5398 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 1.0330145359039307, + "learning_rate": 7.602913795448407e-05, + "loss": 0.6072142720222473, + "step": 5400 + }, + { + "epoch": 2.278481012658228, + "eval_loss": 0.6875645518302917, + "eval_runtime": 861.3558, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 5400 + }, + { + "epoch": 2.279324894514768, + "grad_norm": 1.1858742237091064, + "learning_rate": 7.600863100075472e-05, + "loss": 0.5420109033584595, + "step": 5402 + }, + { + "epoch": 2.280168776371308, + "grad_norm": 1.2126039266586304, + "learning_rate": 7.598811804733373e-05, + "loss": 0.6109243631362915, + "step": 5404 + }, + { + "epoch": 2.2810126582278483, + "grad_norm": 1.1290241479873657, + "learning_rate": 7.5967599098953e-05, + "loss": 0.5889696478843689, + "step": 5406 + }, + { + "epoch": 2.281856540084388, + "grad_norm": 1.320263147354126, + "learning_rate": 7.594707416034586e-05, + "loss": 0.6548630595207214, + "step": 5408 + }, + { + "epoch": 2.282700421940928, + "grad_norm": 1.346169114112854, + "learning_rate": 7.592654323624703e-05, + "loss": 0.6556787490844727, + "step": 5410 + }, + { + "epoch": 2.2835443037974685, + "grad_norm": 1.2104716300964355, + "learning_rate": 7.590600633139265e-05, + "loss": 0.5631673336029053, + "step": 5412 + }, + { + "epoch": 2.2843881856540085, + "grad_norm": 1.3298237323760986, + "learning_rate": 7.58854634505201e-05, + "loss": 0.5931088328361511, + "step": 5414 + }, + { + "epoch": 2.2852320675105484, + "grad_norm": 1.4201204776763916, + "learning_rate": 7.586491459836829e-05, + "loss": 0.6966755986213684, + "step": 5416 + }, + { + "epoch": 2.286075949367089, + "grad_norm": 1.253135323524475, + "learning_rate": 7.584435977967743e-05, + "loss": 0.6172569394111633, + "step": 5418 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 1.133144736289978, + "learning_rate": 7.582379899918911e-05, + "loss": 0.5376655459403992, + "step": 5420 + }, + { + "epoch": 2.2877637130801687, + "grad_norm": 1.1103745698928833, + "learning_rate": 7.580323226164632e-05, + "loss": 0.6138498187065125, + "step": 5422 + }, + { + "epoch": 2.2886075949367086, + "grad_norm": 1.091636300086975, + "learning_rate": 7.57826595717934e-05, + "loss": 0.5049096345901489, + "step": 5424 + }, + { + "epoch": 2.289451476793249, + "grad_norm": 1.2486571073532104, + "learning_rate": 7.57620809343761e-05, + "loss": 0.5666115283966064, + "step": 5426 + }, + { + "epoch": 2.290295358649789, + "grad_norm": 1.510684847831726, + "learning_rate": 7.57414963541415e-05, + "loss": 0.49512919783592224, + "step": 5428 + }, + { + "epoch": 2.291139240506329, + "grad_norm": 1.1142191886901855, + "learning_rate": 7.572090583583805e-05, + "loss": 0.558807373046875, + "step": 5430 + }, + { + "epoch": 2.2919831223628693, + "grad_norm": 1.1162657737731934, + "learning_rate": 7.57003093842156e-05, + "loss": 0.6245265603065491, + "step": 5432 + }, + { + "epoch": 2.292827004219409, + "grad_norm": 1.2784614562988281, + "learning_rate": 7.567970700402537e-05, + "loss": 0.5505527853965759, + "step": 5434 + }, + { + "epoch": 2.293670886075949, + "grad_norm": 1.3142638206481934, + "learning_rate": 7.565909870001992e-05, + "loss": 0.6137702465057373, + "step": 5436 + }, + { + "epoch": 2.2945147679324895, + "grad_norm": 1.072805404663086, + "learning_rate": 7.563848447695318e-05, + "loss": 0.540766716003418, + "step": 5438 + }, + { + "epoch": 2.2953586497890295, + "grad_norm": 1.2861377000808716, + "learning_rate": 7.561786433958048e-05, + "loss": 0.6806555986404419, + "step": 5440 + }, + { + "epoch": 2.2962025316455694, + "grad_norm": 1.3193045854568481, + "learning_rate": 7.559723829265847e-05, + "loss": 0.6191258430480957, + "step": 5442 + }, + { + "epoch": 2.29704641350211, + "grad_norm": 1.1969127655029297, + "learning_rate": 7.55766063409452e-05, + "loss": 0.6067718863487244, + "step": 5444 + }, + { + "epoch": 2.2978902953586497, + "grad_norm": 1.2129666805267334, + "learning_rate": 7.555596848920006e-05, + "loss": 0.5673627257347107, + "step": 5446 + }, + { + "epoch": 2.2987341772151897, + "grad_norm": 1.1639961004257202, + "learning_rate": 7.553532474218379e-05, + "loss": 0.61825031042099, + "step": 5448 + }, + { + "epoch": 2.29957805907173, + "grad_norm": 1.3893283605575562, + "learning_rate": 7.551467510465852e-05, + "loss": 0.6096790432929993, + "step": 5450 + }, + { + "epoch": 2.30042194092827, + "grad_norm": 1.0708417892456055, + "learning_rate": 7.549401958138772e-05, + "loss": 0.6121414303779602, + "step": 5452 + }, + { + "epoch": 2.30126582278481, + "grad_norm": 1.3299298286437988, + "learning_rate": 7.547335817713624e-05, + "loss": 0.6504668593406677, + "step": 5454 + }, + { + "epoch": 2.3021097046413503, + "grad_norm": 1.3594682216644287, + "learning_rate": 7.545269089667022e-05, + "loss": 0.5761144161224365, + "step": 5456 + }, + { + "epoch": 2.3029535864978903, + "grad_norm": 1.1089586019515991, + "learning_rate": 7.543201774475726e-05, + "loss": 0.5457773804664612, + "step": 5458 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 1.3472918272018433, + "learning_rate": 7.541133872616624e-05, + "loss": 0.6014775037765503, + "step": 5460 + }, + { + "epoch": 2.3046413502109706, + "grad_norm": 1.2757689952850342, + "learning_rate": 7.53906538456674e-05, + "loss": 0.6246467232704163, + "step": 5462 + }, + { + "epoch": 2.3054852320675105, + "grad_norm": 1.4598166942596436, + "learning_rate": 7.536996310803236e-05, + "loss": 0.6583935022354126, + "step": 5464 + }, + { + "epoch": 2.3063291139240505, + "grad_norm": 1.2861602306365967, + "learning_rate": 7.534926651803407e-05, + "loss": 0.562523603439331, + "step": 5466 + }, + { + "epoch": 2.307172995780591, + "grad_norm": 1.0953221321105957, + "learning_rate": 7.532856408044684e-05, + "loss": 0.6093505620956421, + "step": 5468 + }, + { + "epoch": 2.308016877637131, + "grad_norm": 1.0982829332351685, + "learning_rate": 7.530785580004631e-05, + "loss": 0.6196447014808655, + "step": 5470 + }, + { + "epoch": 2.3088607594936708, + "grad_norm": 1.2224280834197998, + "learning_rate": 7.52871416816095e-05, + "loss": 0.6360989212989807, + "step": 5472 + }, + { + "epoch": 2.309704641350211, + "grad_norm": 1.244486927986145, + "learning_rate": 7.526642172991476e-05, + "loss": 0.6189543008804321, + "step": 5474 + }, + { + "epoch": 2.310548523206751, + "grad_norm": 1.2408053874969482, + "learning_rate": 7.524569594974178e-05, + "loss": 0.6137582659721375, + "step": 5476 + }, + { + "epoch": 2.311392405063291, + "grad_norm": 1.3323272466659546, + "learning_rate": 7.522496434587157e-05, + "loss": 0.6462169289588928, + "step": 5478 + }, + { + "epoch": 2.3122362869198314, + "grad_norm": 1.1076425313949585, + "learning_rate": 7.520422692308657e-05, + "loss": 0.5495362877845764, + "step": 5480 + }, + { + "epoch": 2.3130801687763713, + "grad_norm": 1.3298509120941162, + "learning_rate": 7.518348368617046e-05, + "loss": 0.5560636520385742, + "step": 5482 + }, + { + "epoch": 2.3139240506329113, + "grad_norm": 1.0740195512771606, + "learning_rate": 7.516273463990832e-05, + "loss": 0.5763371586799622, + "step": 5484 + }, + { + "epoch": 2.3147679324894517, + "grad_norm": 1.0748567581176758, + "learning_rate": 7.514197978908657e-05, + "loss": 0.5111498832702637, + "step": 5486 + }, + { + "epoch": 2.3156118143459916, + "grad_norm": 1.2047218084335327, + "learning_rate": 7.512121913849294e-05, + "loss": 0.6599951982498169, + "step": 5488 + }, + { + "epoch": 2.3164556962025316, + "grad_norm": 1.2956700325012207, + "learning_rate": 7.510045269291651e-05, + "loss": 0.6409770846366882, + "step": 5490 + }, + { + "epoch": 2.317299578059072, + "grad_norm": 1.241860032081604, + "learning_rate": 7.50796804571477e-05, + "loss": 0.5967662334442139, + "step": 5492 + }, + { + "epoch": 2.318143459915612, + "grad_norm": 1.1612682342529297, + "learning_rate": 7.50589024359783e-05, + "loss": 0.5856342315673828, + "step": 5494 + }, + { + "epoch": 2.318987341772152, + "grad_norm": 1.0895500183105469, + "learning_rate": 7.503811863420135e-05, + "loss": 0.5652023553848267, + "step": 5496 + }, + { + "epoch": 2.319831223628692, + "grad_norm": 1.3374481201171875, + "learning_rate": 7.50173290566113e-05, + "loss": 0.6777268648147583, + "step": 5498 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 1.192614197731018, + "learning_rate": 7.499653370800391e-05, + "loss": 0.6052314043045044, + "step": 5500 + }, + { + "epoch": 2.320675105485232, + "eval_loss": 0.6867148876190186, + "eval_runtime": 941.3545, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5500 + }, + { + "epoch": 2.321518987341772, + "grad_norm": 1.1008832454681396, + "learning_rate": 7.497573259317625e-05, + "loss": 0.5208253860473633, + "step": 5502 + }, + { + "epoch": 2.3223628691983125, + "grad_norm": 1.2141541242599487, + "learning_rate": 7.495492571692677e-05, + "loss": 0.6352296471595764, + "step": 5504 + }, + { + "epoch": 2.3232067510548524, + "grad_norm": 1.2588802576065063, + "learning_rate": 7.493411308405517e-05, + "loss": 0.6132256388664246, + "step": 5506 + }, + { + "epoch": 2.3240506329113924, + "grad_norm": 1.348765254020691, + "learning_rate": 7.491329469936258e-05, + "loss": 0.571265697479248, + "step": 5508 + }, + { + "epoch": 2.3248945147679323, + "grad_norm": 1.266377329826355, + "learning_rate": 7.489247056765135e-05, + "loss": 0.5433708429336548, + "step": 5510 + }, + { + "epoch": 2.3257383966244727, + "grad_norm": 1.2920128107070923, + "learning_rate": 7.487164069372523e-05, + "loss": 0.6193158030509949, + "step": 5512 + }, + { + "epoch": 2.3265822784810126, + "grad_norm": 1.068169116973877, + "learning_rate": 7.485080508238928e-05, + "loss": 0.5817977786064148, + "step": 5514 + }, + { + "epoch": 2.3274261603375526, + "grad_norm": 1.2941710948944092, + "learning_rate": 7.482996373844985e-05, + "loss": 0.6558082103729248, + "step": 5516 + }, + { + "epoch": 2.328270042194093, + "grad_norm": 1.2143336534500122, + "learning_rate": 7.480911666671467e-05, + "loss": 0.5569961667060852, + "step": 5518 + }, + { + "epoch": 2.329113924050633, + "grad_norm": 1.3364789485931396, + "learning_rate": 7.478826387199274e-05, + "loss": 0.6497300863265991, + "step": 5520 + }, + { + "epoch": 2.329957805907173, + "grad_norm": 1.057530403137207, + "learning_rate": 7.47674053590944e-05, + "loss": 0.5793087482452393, + "step": 5522 + }, + { + "epoch": 2.330801687763713, + "grad_norm": 1.1543176174163818, + "learning_rate": 7.47465411328313e-05, + "loss": 0.5583140850067139, + "step": 5524 + }, + { + "epoch": 2.331645569620253, + "grad_norm": 1.3409180641174316, + "learning_rate": 7.472567119801645e-05, + "loss": 0.6318784952163696, + "step": 5526 + }, + { + "epoch": 2.332489451476793, + "grad_norm": 1.2899413108825684, + "learning_rate": 7.47047955594641e-05, + "loss": 0.5950855612754822, + "step": 5528 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.329220175743103, + "learning_rate": 7.468391422198989e-05, + "loss": 0.6181023716926575, + "step": 5530 + }, + { + "epoch": 2.3341772151898734, + "grad_norm": 1.202129602432251, + "learning_rate": 7.466302719041073e-05, + "loss": 0.6384578943252563, + "step": 5532 + }, + { + "epoch": 2.3350210970464134, + "grad_norm": 1.1890549659729004, + "learning_rate": 7.464213446954487e-05, + "loss": 0.6059293746948242, + "step": 5534 + }, + { + "epoch": 2.3358649789029537, + "grad_norm": 1.2041429281234741, + "learning_rate": 7.462123606421183e-05, + "loss": 0.6432797908782959, + "step": 5536 + }, + { + "epoch": 2.3367088607594937, + "grad_norm": 1.3827080726623535, + "learning_rate": 7.460033197923249e-05, + "loss": 0.6796717047691345, + "step": 5538 + }, + { + "epoch": 2.3375527426160336, + "grad_norm": 1.2323482036590576, + "learning_rate": 7.457942221942903e-05, + "loss": 0.5772476196289062, + "step": 5540 + }, + { + "epoch": 2.338396624472574, + "grad_norm": 1.2011388540267944, + "learning_rate": 7.455850678962493e-05, + "loss": 0.5964269042015076, + "step": 5542 + }, + { + "epoch": 2.339240506329114, + "grad_norm": 1.1133569478988647, + "learning_rate": 7.453758569464495e-05, + "loss": 0.6416608095169067, + "step": 5544 + }, + { + "epoch": 2.340084388185654, + "grad_norm": 1.1257679462432861, + "learning_rate": 7.451665893931521e-05, + "loss": 0.5668829679489136, + "step": 5546 + }, + { + "epoch": 2.3409282700421943, + "grad_norm": 1.3494724035263062, + "learning_rate": 7.449572652846311e-05, + "loss": 0.6029916405677795, + "step": 5548 + }, + { + "epoch": 2.3417721518987342, + "grad_norm": 1.2199759483337402, + "learning_rate": 7.447478846691735e-05, + "loss": 0.6336984634399414, + "step": 5550 + }, + { + "epoch": 2.342616033755274, + "grad_norm": 1.2806570529937744, + "learning_rate": 7.445384475950792e-05, + "loss": 0.579140305519104, + "step": 5552 + }, + { + "epoch": 2.343459915611814, + "grad_norm": 0.9874221086502075, + "learning_rate": 7.443289541106616e-05, + "loss": 0.6061640381813049, + "step": 5554 + }, + { + "epoch": 2.3443037974683545, + "grad_norm": 1.2271486520767212, + "learning_rate": 7.441194042642467e-05, + "loss": 0.5502339601516724, + "step": 5556 + }, + { + "epoch": 2.3451476793248944, + "grad_norm": 1.2522462606430054, + "learning_rate": 7.439097981041738e-05, + "loss": 0.5774438381195068, + "step": 5558 + }, + { + "epoch": 2.3459915611814344, + "grad_norm": 1.267204761505127, + "learning_rate": 7.437001356787945e-05, + "loss": 0.6091527342796326, + "step": 5560 + }, + { + "epoch": 2.3468354430379748, + "grad_norm": 1.1711935997009277, + "learning_rate": 7.434904170364747e-05, + "loss": 0.5443631410598755, + "step": 5562 + }, + { + "epoch": 2.3476793248945147, + "grad_norm": 1.085097074508667, + "learning_rate": 7.432806422255918e-05, + "loss": 0.5255029201507568, + "step": 5564 + }, + { + "epoch": 2.3485232067510546, + "grad_norm": 1.3244949579238892, + "learning_rate": 7.430708112945369e-05, + "loss": 0.5197238922119141, + "step": 5566 + }, + { + "epoch": 2.349367088607595, + "grad_norm": 1.3646879196166992, + "learning_rate": 7.428609242917141e-05, + "loss": 0.5576170682907104, + "step": 5568 + }, + { + "epoch": 2.350210970464135, + "grad_norm": 1.339190125465393, + "learning_rate": 7.426509812655406e-05, + "loss": 0.6254662275314331, + "step": 5570 + }, + { + "epoch": 2.351054852320675, + "grad_norm": 1.4624155759811401, + "learning_rate": 7.424409822644457e-05, + "loss": 0.6593500375747681, + "step": 5572 + }, + { + "epoch": 2.3518987341772153, + "grad_norm": 1.1931114196777344, + "learning_rate": 7.422309273368722e-05, + "loss": 0.6102238297462463, + "step": 5574 + }, + { + "epoch": 2.3527426160337552, + "grad_norm": 1.789340615272522, + "learning_rate": 7.420208165312762e-05, + "loss": 0.6695854067802429, + "step": 5576 + }, + { + "epoch": 2.353586497890295, + "grad_norm": 1.2364262342453003, + "learning_rate": 7.418106498961258e-05, + "loss": 0.578844428062439, + "step": 5578 + }, + { + "epoch": 2.3544303797468356, + "grad_norm": 1.1568509340286255, + "learning_rate": 7.416004274799027e-05, + "loss": 0.5717503428459167, + "step": 5580 + }, + { + "epoch": 2.3552742616033755, + "grad_norm": 1.1744630336761475, + "learning_rate": 7.413901493311009e-05, + "loss": 0.6170201897621155, + "step": 5582 + }, + { + "epoch": 2.3561181434599154, + "grad_norm": 1.0684332847595215, + "learning_rate": 7.411798154982275e-05, + "loss": 0.6482691764831543, + "step": 5584 + }, + { + "epoch": 2.356962025316456, + "grad_norm": 1.046196460723877, + "learning_rate": 7.409694260298025e-05, + "loss": 0.572839617729187, + "step": 5586 + }, + { + "epoch": 2.3578059071729958, + "grad_norm": 1.0110210180282593, + "learning_rate": 7.407589809743591e-05, + "loss": 0.5645976662635803, + "step": 5588 + }, + { + "epoch": 2.3586497890295357, + "grad_norm": 1.0801016092300415, + "learning_rate": 7.405484803804425e-05, + "loss": 0.5653133392333984, + "step": 5590 + }, + { + "epoch": 2.359493670886076, + "grad_norm": 1.0934380292892456, + "learning_rate": 7.403379242966116e-05, + "loss": 0.5972150564193726, + "step": 5592 + }, + { + "epoch": 2.360337552742616, + "grad_norm": 1.3722410202026367, + "learning_rate": 7.40127312771437e-05, + "loss": 0.5927542448043823, + "step": 5594 + }, + { + "epoch": 2.361181434599156, + "grad_norm": 1.1567236185073853, + "learning_rate": 7.399166458535032e-05, + "loss": 0.547027051448822, + "step": 5596 + }, + { + "epoch": 2.3620253164556964, + "grad_norm": 1.2254211902618408, + "learning_rate": 7.397059235914067e-05, + "loss": 0.5356617569923401, + "step": 5598 + }, + { + "epoch": 2.3628691983122363, + "grad_norm": 1.1529103517532349, + "learning_rate": 7.394951460337575e-05, + "loss": 0.5424175262451172, + "step": 5600 + }, + { + "epoch": 2.3628691983122363, + "eval_loss": 0.6851074695587158, + "eval_runtime": 938.5536, + "eval_samples_per_second": 2.245, + "eval_steps_per_second": 2.245, + "step": 5600 + }, + { + "epoch": 2.3637130801687762, + "grad_norm": 1.2050299644470215, + "learning_rate": 7.392843132291777e-05, + "loss": 0.5834107398986816, + "step": 5602 + }, + { + "epoch": 2.3645569620253166, + "grad_norm": 1.264567494392395, + "learning_rate": 7.390734252263024e-05, + "loss": 0.5445035099983215, + "step": 5604 + }, + { + "epoch": 2.3654008438818566, + "grad_norm": 1.357791781425476, + "learning_rate": 7.388624820737791e-05, + "loss": 0.6207653880119324, + "step": 5606 + }, + { + "epoch": 2.3662447257383965, + "grad_norm": 1.2246928215026855, + "learning_rate": 7.386514838202689e-05, + "loss": 0.6628696322441101, + "step": 5608 + }, + { + "epoch": 2.367088607594937, + "grad_norm": 1.1455399990081787, + "learning_rate": 7.384404305144447e-05, + "loss": 0.5870704054832458, + "step": 5610 + }, + { + "epoch": 2.367932489451477, + "grad_norm": 1.2338638305664062, + "learning_rate": 7.382293222049925e-05, + "loss": 0.6160538792610168, + "step": 5612 + }, + { + "epoch": 2.3687763713080168, + "grad_norm": 1.231271505355835, + "learning_rate": 7.38018158940611e-05, + "loss": 0.6274036765098572, + "step": 5614 + }, + { + "epoch": 2.369620253164557, + "grad_norm": 1.022050380706787, + "learning_rate": 7.378069407700114e-05, + "loss": 0.5623515248298645, + "step": 5616 + }, + { + "epoch": 2.370464135021097, + "grad_norm": 1.2040951251983643, + "learning_rate": 7.375956677419178e-05, + "loss": 0.5505564212799072, + "step": 5618 + }, + { + "epoch": 2.371308016877637, + "grad_norm": 1.1754523515701294, + "learning_rate": 7.373843399050668e-05, + "loss": 0.6537002921104431, + "step": 5620 + }, + { + "epoch": 2.3721518987341774, + "grad_norm": 1.1710485219955444, + "learning_rate": 7.371729573082073e-05, + "loss": 0.6224458813667297, + "step": 5622 + }, + { + "epoch": 2.3729957805907174, + "grad_norm": 1.1629483699798584, + "learning_rate": 7.36961520000102e-05, + "loss": 0.6297177076339722, + "step": 5624 + }, + { + "epoch": 2.3738396624472573, + "grad_norm": 1.1069440841674805, + "learning_rate": 7.367500280295248e-05, + "loss": 0.5202008485794067, + "step": 5626 + }, + { + "epoch": 2.3746835443037977, + "grad_norm": 1.0068297386169434, + "learning_rate": 7.36538481445263e-05, + "loss": 0.5256102681159973, + "step": 5628 + }, + { + "epoch": 2.3755274261603376, + "grad_norm": 1.1103417873382568, + "learning_rate": 7.363268802961161e-05, + "loss": 0.5460903644561768, + "step": 5630 + }, + { + "epoch": 2.3763713080168776, + "grad_norm": 1.2885268926620483, + "learning_rate": 7.361152246308969e-05, + "loss": 0.5817124247550964, + "step": 5632 + }, + { + "epoch": 2.377215189873418, + "grad_norm": 1.233831524848938, + "learning_rate": 7.359035144984302e-05, + "loss": 0.5415143966674805, + "step": 5634 + }, + { + "epoch": 2.378059071729958, + "grad_norm": 1.3451908826828003, + "learning_rate": 7.35691749947553e-05, + "loss": 0.6837685108184814, + "step": 5636 + }, + { + "epoch": 2.378902953586498, + "grad_norm": 1.1320621967315674, + "learning_rate": 7.354799310271159e-05, + "loss": 0.5966196656227112, + "step": 5638 + }, + { + "epoch": 2.379746835443038, + "grad_norm": 1.1884461641311646, + "learning_rate": 7.35268057785981e-05, + "loss": 0.5607479214668274, + "step": 5640 + }, + { + "epoch": 2.380590717299578, + "grad_norm": 1.2710856199264526, + "learning_rate": 7.350561302730236e-05, + "loss": 0.595242977142334, + "step": 5642 + }, + { + "epoch": 2.381434599156118, + "grad_norm": 1.3110458850860596, + "learning_rate": 7.348441485371314e-05, + "loss": 0.6208752393722534, + "step": 5644 + }, + { + "epoch": 2.382278481012658, + "grad_norm": 1.1734380722045898, + "learning_rate": 7.346321126272044e-05, + "loss": 0.6173125505447388, + "step": 5646 + }, + { + "epoch": 2.3831223628691984, + "grad_norm": 1.2024762630462646, + "learning_rate": 7.34420022592155e-05, + "loss": 0.6013050675392151, + "step": 5648 + }, + { + "epoch": 2.3839662447257384, + "grad_norm": 1.1305288076400757, + "learning_rate": 7.342078784809086e-05, + "loss": 0.5919594764709473, + "step": 5650 + }, + { + "epoch": 2.3848101265822783, + "grad_norm": 1.075323462486267, + "learning_rate": 7.339956803424028e-05, + "loss": 0.5399283766746521, + "step": 5652 + }, + { + "epoch": 2.3856540084388187, + "grad_norm": 1.2035599946975708, + "learning_rate": 7.337834282255873e-05, + "loss": 0.6253576874732971, + "step": 5654 + }, + { + "epoch": 2.3864978902953586, + "grad_norm": 1.0572105646133423, + "learning_rate": 7.335711221794251e-05, + "loss": 0.5247007608413696, + "step": 5656 + }, + { + "epoch": 2.3873417721518986, + "grad_norm": 1.2701191902160645, + "learning_rate": 7.333587622528906e-05, + "loss": 0.5800243020057678, + "step": 5658 + }, + { + "epoch": 2.388185654008439, + "grad_norm": 1.1772741079330444, + "learning_rate": 7.331463484949716e-05, + "loss": 0.589645504951477, + "step": 5660 + }, + { + "epoch": 2.389029535864979, + "grad_norm": 1.0562703609466553, + "learning_rate": 7.329338809546674e-05, + "loss": 0.5820419192314148, + "step": 5662 + }, + { + "epoch": 2.389873417721519, + "grad_norm": 1.1634355783462524, + "learning_rate": 7.327213596809906e-05, + "loss": 0.591435432434082, + "step": 5664 + }, + { + "epoch": 2.3907172995780592, + "grad_norm": 1.2220302820205688, + "learning_rate": 7.325087847229655e-05, + "loss": 0.5630883574485779, + "step": 5666 + }, + { + "epoch": 2.391561181434599, + "grad_norm": 1.4087659120559692, + "learning_rate": 7.322961561296294e-05, + "loss": 0.6050130128860474, + "step": 5668 + }, + { + "epoch": 2.392405063291139, + "grad_norm": 1.1126172542572021, + "learning_rate": 7.320834739500313e-05, + "loss": 0.56146240234375, + "step": 5670 + }, + { + "epoch": 2.3932489451476795, + "grad_norm": 0.99373859167099, + "learning_rate": 7.31870738233233e-05, + "loss": 0.5507852435112, + "step": 5672 + }, + { + "epoch": 2.3940928270042194, + "grad_norm": 1.14408540725708, + "learning_rate": 7.316579490283085e-05, + "loss": 0.5895347595214844, + "step": 5674 + }, + { + "epoch": 2.3949367088607594, + "grad_norm": 1.1728581190109253, + "learning_rate": 7.314451063843443e-05, + "loss": 0.5304404497146606, + "step": 5676 + }, + { + "epoch": 2.3957805907172998, + "grad_norm": 1.1721378564834595, + "learning_rate": 7.31232210350439e-05, + "loss": 0.5805793404579163, + "step": 5678 + }, + { + "epoch": 2.3966244725738397, + "grad_norm": 1.0499866008758545, + "learning_rate": 7.310192609757038e-05, + "loss": 0.5671767592430115, + "step": 5680 + }, + { + "epoch": 2.3974683544303796, + "grad_norm": 1.0959177017211914, + "learning_rate": 7.308062583092617e-05, + "loss": 0.6335723400115967, + "step": 5682 + }, + { + "epoch": 2.3983122362869196, + "grad_norm": 1.31142258644104, + "learning_rate": 7.305932024002487e-05, + "loss": 0.6032374501228333, + "step": 5684 + }, + { + "epoch": 2.39915611814346, + "grad_norm": 0.9212818741798401, + "learning_rate": 7.303800932978124e-05, + "loss": 0.5492936372756958, + "step": 5686 + }, + { + "epoch": 2.4, + "grad_norm": 1.1956428289413452, + "learning_rate": 7.301669310511132e-05, + "loss": 0.5533297061920166, + "step": 5688 + }, + { + "epoch": 2.40084388185654, + "grad_norm": 1.4048634767532349, + "learning_rate": 7.299537157093232e-05, + "loss": 0.5859368443489075, + "step": 5690 + }, + { + "epoch": 2.4016877637130802, + "grad_norm": 1.0580679178237915, + "learning_rate": 7.297404473216277e-05, + "loss": 0.5099439024925232, + "step": 5692 + }, + { + "epoch": 2.40253164556962, + "grad_norm": 1.2450575828552246, + "learning_rate": 7.29527125937223e-05, + "loss": 0.5631486177444458, + "step": 5694 + }, + { + "epoch": 2.40337552742616, + "grad_norm": 1.338466763496399, + "learning_rate": 7.293137516053187e-05, + "loss": 0.6045404672622681, + "step": 5696 + }, + { + "epoch": 2.4042194092827005, + "grad_norm": 1.198588252067566, + "learning_rate": 7.291003243751358e-05, + "loss": 0.6063475608825684, + "step": 5698 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 1.2315080165863037, + "learning_rate": 7.288868442959081e-05, + "loss": 0.5734809041023254, + "step": 5700 + }, + { + "epoch": 2.4050632911392404, + "eval_loss": 0.6841402053833008, + "eval_runtime": 941.6641, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5700 + }, + { + "epoch": 2.4059071729957804, + "grad_norm": 1.1494885683059692, + "learning_rate": 7.286733114168812e-05, + "loss": 0.5744594931602478, + "step": 5702 + }, + { + "epoch": 2.4067510548523208, + "grad_norm": 1.3769505023956299, + "learning_rate": 7.284597257873132e-05, + "loss": 0.611789882183075, + "step": 5704 + }, + { + "epoch": 2.4075949367088607, + "grad_norm": 1.2326449155807495, + "learning_rate": 7.28246087456474e-05, + "loss": 0.6091431975364685, + "step": 5706 + }, + { + "epoch": 2.4084388185654007, + "grad_norm": 1.1960830688476562, + "learning_rate": 7.28032396473646e-05, + "loss": 0.49431973695755005, + "step": 5708 + }, + { + "epoch": 2.409282700421941, + "grad_norm": 1.1672827005386353, + "learning_rate": 7.278186528881237e-05, + "loss": 0.5344718098640442, + "step": 5710 + }, + { + "epoch": 2.410126582278481, + "grad_norm": 1.1923719644546509, + "learning_rate": 7.276048567492136e-05, + "loss": 0.6011165380477905, + "step": 5712 + }, + { + "epoch": 2.410970464135021, + "grad_norm": 1.2314990758895874, + "learning_rate": 7.273910081062341e-05, + "loss": 0.6300925016403198, + "step": 5714 + }, + { + "epoch": 2.4118143459915613, + "grad_norm": 0.8976680040359497, + "learning_rate": 7.27177107008516e-05, + "loss": 0.56329345703125, + "step": 5716 + }, + { + "epoch": 2.4126582278481012, + "grad_norm": 1.2954038381576538, + "learning_rate": 7.269631535054026e-05, + "loss": 0.6266427040100098, + "step": 5718 + }, + { + "epoch": 2.413502109704641, + "grad_norm": 1.3357585668563843, + "learning_rate": 7.267491476462485e-05, + "loss": 0.6234018802642822, + "step": 5720 + }, + { + "epoch": 2.4143459915611816, + "grad_norm": 1.1913645267486572, + "learning_rate": 7.265350894804209e-05, + "loss": 0.5909059047698975, + "step": 5722 + }, + { + "epoch": 2.4151898734177215, + "grad_norm": 1.3425955772399902, + "learning_rate": 7.263209790572986e-05, + "loss": 0.5708479285240173, + "step": 5724 + }, + { + "epoch": 2.4160337552742615, + "grad_norm": 1.2258507013320923, + "learning_rate": 7.261068164262734e-05, + "loss": 0.5810034871101379, + "step": 5726 + }, + { + "epoch": 2.416877637130802, + "grad_norm": 1.348794937133789, + "learning_rate": 7.258926016367479e-05, + "loss": 0.5939235687255859, + "step": 5728 + }, + { + "epoch": 2.4177215189873418, + "grad_norm": 1.0896574258804321, + "learning_rate": 7.256783347381375e-05, + "loss": 0.6298259496688843, + "step": 5730 + }, + { + "epoch": 2.4185654008438817, + "grad_norm": 1.164866328239441, + "learning_rate": 7.254640157798696e-05, + "loss": 0.5277430415153503, + "step": 5732 + }, + { + "epoch": 2.419409282700422, + "grad_norm": 1.1215453147888184, + "learning_rate": 7.252496448113833e-05, + "loss": 0.5724055767059326, + "step": 5734 + }, + { + "epoch": 2.420253164556962, + "grad_norm": 1.0640764236450195, + "learning_rate": 7.2503522188213e-05, + "loss": 0.5439977645874023, + "step": 5736 + }, + { + "epoch": 2.421097046413502, + "grad_norm": 1.4874604940414429, + "learning_rate": 7.248207470415729e-05, + "loss": 0.7568614482879639, + "step": 5738 + }, + { + "epoch": 2.4219409282700424, + "grad_norm": 1.2611099481582642, + "learning_rate": 7.246062203391873e-05, + "loss": 0.6389632225036621, + "step": 5740 + }, + { + "epoch": 2.4227848101265823, + "grad_norm": 1.185644507408142, + "learning_rate": 7.243916418244602e-05, + "loss": 0.6180628538131714, + "step": 5742 + }, + { + "epoch": 2.4236286919831223, + "grad_norm": 1.1648430824279785, + "learning_rate": 7.241770115468909e-05, + "loss": 0.619799017906189, + "step": 5744 + }, + { + "epoch": 2.4244725738396626, + "grad_norm": 1.1974445581436157, + "learning_rate": 7.239623295559903e-05, + "loss": 0.6446201205253601, + "step": 5746 + }, + { + "epoch": 2.4253164556962026, + "grad_norm": 1.140477180480957, + "learning_rate": 7.237475959012818e-05, + "loss": 0.5839580297470093, + "step": 5748 + }, + { + "epoch": 2.4261603375527425, + "grad_norm": 1.1374423503875732, + "learning_rate": 7.235328106322998e-05, + "loss": 0.48815420269966125, + "step": 5750 + }, + { + "epoch": 2.427004219409283, + "grad_norm": 1.411432147026062, + "learning_rate": 7.233179737985916e-05, + "loss": 0.638519287109375, + "step": 5752 + }, + { + "epoch": 2.427848101265823, + "grad_norm": 1.1232497692108154, + "learning_rate": 7.231030854497157e-05, + "loss": 0.5776677131652832, + "step": 5754 + }, + { + "epoch": 2.428691983122363, + "grad_norm": 1.0815738439559937, + "learning_rate": 7.228881456352428e-05, + "loss": 0.5297027230262756, + "step": 5756 + }, + { + "epoch": 2.429535864978903, + "grad_norm": 1.2230733633041382, + "learning_rate": 7.226731544047553e-05, + "loss": 0.5630011558532715, + "step": 5758 + }, + { + "epoch": 2.430379746835443, + "grad_norm": 1.2033147811889648, + "learning_rate": 7.224581118078476e-05, + "loss": 0.5772101283073425, + "step": 5760 + }, + { + "epoch": 2.431223628691983, + "grad_norm": 1.2150053977966309, + "learning_rate": 7.22243017894126e-05, + "loss": 0.5412847399711609, + "step": 5762 + }, + { + "epoch": 2.4320675105485234, + "grad_norm": 1.0494824647903442, + "learning_rate": 7.220278727132083e-05, + "loss": 0.5568405389785767, + "step": 5764 + }, + { + "epoch": 2.4329113924050634, + "grad_norm": 1.2803306579589844, + "learning_rate": 7.218126763147244e-05, + "loss": 0.6022217869758606, + "step": 5766 + }, + { + "epoch": 2.4337552742616033, + "grad_norm": 1.0832798480987549, + "learning_rate": 7.215974287483163e-05, + "loss": 0.5568796396255493, + "step": 5768 + }, + { + "epoch": 2.4345991561181437, + "grad_norm": 1.1829264163970947, + "learning_rate": 7.213821300636372e-05, + "loss": 0.5607990026473999, + "step": 5770 + }, + { + "epoch": 2.4354430379746836, + "grad_norm": 2.3017473220825195, + "learning_rate": 7.211667803103523e-05, + "loss": 0.6382274031639099, + "step": 5772 + }, + { + "epoch": 2.4362869198312236, + "grad_norm": 1.1701387166976929, + "learning_rate": 7.209513795381388e-05, + "loss": 0.5748776793479919, + "step": 5774 + }, + { + "epoch": 2.4371308016877635, + "grad_norm": 1.0480856895446777, + "learning_rate": 7.207359277966856e-05, + "loss": 0.5760934352874756, + "step": 5776 + }, + { + "epoch": 2.437974683544304, + "grad_norm": 1.2263693809509277, + "learning_rate": 7.20520425135693e-05, + "loss": 0.6387208104133606, + "step": 5778 + }, + { + "epoch": 2.438818565400844, + "grad_norm": 1.219246506690979, + "learning_rate": 7.203048716048737e-05, + "loss": 0.6078037619590759, + "step": 5780 + }, + { + "epoch": 2.439662447257384, + "grad_norm": 1.2452640533447266, + "learning_rate": 7.200892672539515e-05, + "loss": 0.606924831867218, + "step": 5782 + }, + { + "epoch": 2.440506329113924, + "grad_norm": 1.3469732999801636, + "learning_rate": 7.198736121326621e-05, + "loss": 0.585297703742981, + "step": 5784 + }, + { + "epoch": 2.441350210970464, + "grad_norm": 1.151127576828003, + "learning_rate": 7.196579062907533e-05, + "loss": 0.5849902033805847, + "step": 5786 + }, + { + "epoch": 2.442194092827004, + "grad_norm": 1.0669564008712769, + "learning_rate": 7.19442149777984e-05, + "loss": 0.6150397062301636, + "step": 5788 + }, + { + "epoch": 2.4430379746835444, + "grad_norm": 1.1700209379196167, + "learning_rate": 7.192263426441252e-05, + "loss": 0.6324567794799805, + "step": 5790 + }, + { + "epoch": 2.4438818565400844, + "grad_norm": 1.2832094430923462, + "learning_rate": 7.190104849389597e-05, + "loss": 0.6202381253242493, + "step": 5792 + }, + { + "epoch": 2.4447257383966243, + "grad_norm": 1.2046177387237549, + "learning_rate": 7.187945767122813e-05, + "loss": 0.6156684756278992, + "step": 5794 + }, + { + "epoch": 2.4455696202531647, + "grad_norm": 1.031133770942688, + "learning_rate": 7.185786180138961e-05, + "loss": 0.5763497352600098, + "step": 5796 + }, + { + "epoch": 2.4464135021097047, + "grad_norm": 1.2803475856781006, + "learning_rate": 7.183626088936216e-05, + "loss": 0.5419677495956421, + "step": 5798 + }, + { + "epoch": 2.4472573839662446, + "grad_norm": 1.2407588958740234, + "learning_rate": 7.181465494012869e-05, + "loss": 0.629108190536499, + "step": 5800 + }, + { + "epoch": 2.4472573839662446, + "eval_loss": 0.6835155487060547, + "eval_runtime": 758.407, + "eval_samples_per_second": 2.778, + "eval_steps_per_second": 2.778, + "step": 5800 + }, + { + "epoch": 2.448101265822785, + "grad_norm": 1.3525878190994263, + "learning_rate": 7.17930439586733e-05, + "loss": 0.6146516799926758, + "step": 5802 + }, + { + "epoch": 2.448945147679325, + "grad_norm": 1.255921721458435, + "learning_rate": 7.177142794998121e-05, + "loss": 0.5796315670013428, + "step": 5804 + }, + { + "epoch": 2.449789029535865, + "grad_norm": 1.2135448455810547, + "learning_rate": 7.174980691903881e-05, + "loss": 0.5978766679763794, + "step": 5806 + }, + { + "epoch": 2.4506329113924052, + "grad_norm": 1.117942214012146, + "learning_rate": 7.172818087083367e-05, + "loss": 0.5941054821014404, + "step": 5808 + }, + { + "epoch": 2.451476793248945, + "grad_norm": 1.2917672395706177, + "learning_rate": 7.17065498103545e-05, + "loss": 0.6213865876197815, + "step": 5810 + }, + { + "epoch": 2.452320675105485, + "grad_norm": 1.2287952899932861, + "learning_rate": 7.168491374259118e-05, + "loss": 0.627090573310852, + "step": 5812 + }, + { + "epoch": 2.453164556962025, + "grad_norm": 1.2427480220794678, + "learning_rate": 7.16632726725347e-05, + "loss": 0.605871319770813, + "step": 5814 + }, + { + "epoch": 2.4540084388185655, + "grad_norm": 1.2568929195404053, + "learning_rate": 7.16416266051773e-05, + "loss": 0.5961518883705139, + "step": 5816 + }, + { + "epoch": 2.4548523206751054, + "grad_norm": 1.2202998399734497, + "learning_rate": 7.161997554551226e-05, + "loss": 0.585054874420166, + "step": 5818 + }, + { + "epoch": 2.4556962025316453, + "grad_norm": 1.2326043844223022, + "learning_rate": 7.159831949853409e-05, + "loss": 0.6219096779823303, + "step": 5820 + }, + { + "epoch": 2.4565400843881857, + "grad_norm": 1.2161623239517212, + "learning_rate": 7.15766584692384e-05, + "loss": 0.641189455986023, + "step": 5822 + }, + { + "epoch": 2.4573839662447257, + "grad_norm": 1.2391023635864258, + "learning_rate": 7.1554992462622e-05, + "loss": 0.577190101146698, + "step": 5824 + }, + { + "epoch": 2.4582278481012656, + "grad_norm": 1.0883333683013916, + "learning_rate": 7.153332148368281e-05, + "loss": 0.5264694690704346, + "step": 5826 + }, + { + "epoch": 2.459071729957806, + "grad_norm": 1.2129524946212769, + "learning_rate": 7.15116455374199e-05, + "loss": 0.631437361240387, + "step": 5828 + }, + { + "epoch": 2.459915611814346, + "grad_norm": 1.0476374626159668, + "learning_rate": 7.148996462883352e-05, + "loss": 0.5025489926338196, + "step": 5830 + }, + { + "epoch": 2.460759493670886, + "grad_norm": 1.1389570236206055, + "learning_rate": 7.146827876292502e-05, + "loss": 0.5903586745262146, + "step": 5832 + }, + { + "epoch": 2.4616033755274263, + "grad_norm": 1.4385539293289185, + "learning_rate": 7.14465879446969e-05, + "loss": 0.633786141872406, + "step": 5834 + }, + { + "epoch": 2.462447257383966, + "grad_norm": 1.1184585094451904, + "learning_rate": 7.142489217915283e-05, + "loss": 0.5889136791229248, + "step": 5836 + }, + { + "epoch": 2.463291139240506, + "grad_norm": 1.2257685661315918, + "learning_rate": 7.140319147129763e-05, + "loss": 0.5774597525596619, + "step": 5838 + }, + { + "epoch": 2.4641350210970465, + "grad_norm": 0.9524238109588623, + "learning_rate": 7.13814858261372e-05, + "loss": 0.5220611095428467, + "step": 5840 + }, + { + "epoch": 2.4649789029535865, + "grad_norm": 1.2814422845840454, + "learning_rate": 7.135977524867861e-05, + "loss": 0.5724858641624451, + "step": 5842 + }, + { + "epoch": 2.4658227848101264, + "grad_norm": 1.0978140830993652, + "learning_rate": 7.133805974393013e-05, + "loss": 0.5469759702682495, + "step": 5844 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 1.310279130935669, + "learning_rate": 7.131633931690104e-05, + "loss": 0.6554312705993652, + "step": 5846 + }, + { + "epoch": 2.4675105485232067, + "grad_norm": 1.286189317703247, + "learning_rate": 7.129461397260187e-05, + "loss": 0.6166019439697266, + "step": 5848 + }, + { + "epoch": 2.4683544303797467, + "grad_norm": 1.1586377620697021, + "learning_rate": 7.127288371604424e-05, + "loss": 0.6301121711730957, + "step": 5850 + }, + { + "epoch": 2.469198312236287, + "grad_norm": 1.1684564352035522, + "learning_rate": 7.125114855224087e-05, + "loss": 0.6022663712501526, + "step": 5852 + }, + { + "epoch": 2.470042194092827, + "grad_norm": 1.182511329650879, + "learning_rate": 7.122940848620567e-05, + "loss": 0.5959302186965942, + "step": 5854 + }, + { + "epoch": 2.470886075949367, + "grad_norm": 1.2383002042770386, + "learning_rate": 7.120766352295366e-05, + "loss": 0.6251413822174072, + "step": 5856 + }, + { + "epoch": 2.4717299578059073, + "grad_norm": 1.2001979351043701, + "learning_rate": 7.118591366750097e-05, + "loss": 0.6332544088363647, + "step": 5858 + }, + { + "epoch": 2.4725738396624473, + "grad_norm": 1.2166392803192139, + "learning_rate": 7.116415892486488e-05, + "loss": 0.5797795057296753, + "step": 5860 + }, + { + "epoch": 2.473417721518987, + "grad_norm": 1.2235382795333862, + "learning_rate": 7.114239930006379e-05, + "loss": 0.5335313081741333, + "step": 5862 + }, + { + "epoch": 2.4742616033755276, + "grad_norm": 1.2405973672866821, + "learning_rate": 7.112063479811724e-05, + "loss": 0.5536905527114868, + "step": 5864 + }, + { + "epoch": 2.4751054852320675, + "grad_norm": 1.116328477859497, + "learning_rate": 7.109886542404585e-05, + "loss": 0.554654061794281, + "step": 5866 + }, + { + "epoch": 2.4759493670886075, + "grad_norm": 1.2757837772369385, + "learning_rate": 7.107709118287143e-05, + "loss": 0.6017873287200928, + "step": 5868 + }, + { + "epoch": 2.476793248945148, + "grad_norm": 1.3445937633514404, + "learning_rate": 7.105531207961686e-05, + "loss": 0.6479908227920532, + "step": 5870 + }, + { + "epoch": 2.477637130801688, + "grad_norm": 1.1464542150497437, + "learning_rate": 7.103352811930619e-05, + "loss": 0.5829157829284668, + "step": 5872 + }, + { + "epoch": 2.4784810126582277, + "grad_norm": 1.3275130987167358, + "learning_rate": 7.101173930696453e-05, + "loss": 0.54380863904953, + "step": 5874 + }, + { + "epoch": 2.479324894514768, + "grad_norm": 1.006990909576416, + "learning_rate": 7.098994564761813e-05, + "loss": 0.6313910484313965, + "step": 5876 + }, + { + "epoch": 2.480168776371308, + "grad_norm": 1.1358299255371094, + "learning_rate": 7.09681471462944e-05, + "loss": 0.5343483090400696, + "step": 5878 + }, + { + "epoch": 2.481012658227848, + "grad_norm": 1.1456117630004883, + "learning_rate": 7.094634380802184e-05, + "loss": 0.49450409412384033, + "step": 5880 + }, + { + "epoch": 2.4818565400843884, + "grad_norm": 1.2961846590042114, + "learning_rate": 7.092453563783003e-05, + "loss": 0.6378757357597351, + "step": 5882 + }, + { + "epoch": 2.4827004219409283, + "grad_norm": 0.983889102935791, + "learning_rate": 7.090272264074972e-05, + "loss": 0.5937124490737915, + "step": 5884 + }, + { + "epoch": 2.4835443037974683, + "grad_norm": 1.0205817222595215, + "learning_rate": 7.088090482181273e-05, + "loss": 0.5301283597946167, + "step": 5886 + }, + { + "epoch": 2.4843881856540087, + "grad_norm": 1.1721397638320923, + "learning_rate": 7.085908218605204e-05, + "loss": 0.6191756129264832, + "step": 5888 + }, + { + "epoch": 2.4852320675105486, + "grad_norm": 1.2432814836502075, + "learning_rate": 7.083725473850168e-05, + "loss": 0.5928890109062195, + "step": 5890 + }, + { + "epoch": 2.4860759493670885, + "grad_norm": 1.252125859260559, + "learning_rate": 7.081542248419686e-05, + "loss": 0.6136764287948608, + "step": 5892 + }, + { + "epoch": 2.486919831223629, + "grad_norm": 1.3686699867248535, + "learning_rate": 7.079358542817382e-05, + "loss": 0.6084910035133362, + "step": 5894 + }, + { + "epoch": 2.487763713080169, + "grad_norm": 1.0877282619476318, + "learning_rate": 7.077174357546996e-05, + "loss": 0.5862250924110413, + "step": 5896 + }, + { + "epoch": 2.488607594936709, + "grad_norm": 1.164095401763916, + "learning_rate": 7.074989693112381e-05, + "loss": 0.6300894021987915, + "step": 5898 + }, + { + "epoch": 2.489451476793249, + "grad_norm": 1.1169507503509521, + "learning_rate": 7.072804550017493e-05, + "loss": 0.5508570075035095, + "step": 5900 + }, + { + "epoch": 2.489451476793249, + "eval_loss": 0.6820966005325317, + "eval_runtime": 513.3515, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 5900 + }, + { + "epoch": 2.490295358649789, + "grad_norm": 1.1718615293502808, + "learning_rate": 7.070618928766406e-05, + "loss": 0.550847589969635, + "step": 5902 + }, + { + "epoch": 2.491139240506329, + "grad_norm": 1.4725650548934937, + "learning_rate": 7.068432829863298e-05, + "loss": 0.5663347840309143, + "step": 5904 + }, + { + "epoch": 2.491983122362869, + "grad_norm": 1.042083978652954, + "learning_rate": 7.066246253812462e-05, + "loss": 0.5506191849708557, + "step": 5906 + }, + { + "epoch": 2.4928270042194094, + "grad_norm": 1.2020974159240723, + "learning_rate": 7.064059201118297e-05, + "loss": 0.5656929612159729, + "step": 5908 + }, + { + "epoch": 2.4936708860759493, + "grad_norm": 1.1040663719177246, + "learning_rate": 7.061871672285317e-05, + "loss": 0.5159370303153992, + "step": 5910 + }, + { + "epoch": 2.4945147679324893, + "grad_norm": 1.3681589365005493, + "learning_rate": 7.05968366781814e-05, + "loss": 0.6161949634552002, + "step": 5912 + }, + { + "epoch": 2.4953586497890297, + "grad_norm": 1.26628839969635, + "learning_rate": 7.057495188221498e-05, + "loss": 0.6357758641242981, + "step": 5914 + }, + { + "epoch": 2.4962025316455696, + "grad_norm": 1.2714020013809204, + "learning_rate": 7.05530623400023e-05, + "loss": 0.5467366576194763, + "step": 5916 + }, + { + "epoch": 2.4970464135021095, + "grad_norm": 1.2255018949508667, + "learning_rate": 7.053116805659287e-05, + "loss": 0.592526376247406, + "step": 5918 + }, + { + "epoch": 2.49789029535865, + "grad_norm": 1.2816206216812134, + "learning_rate": 7.050926903703729e-05, + "loss": 0.5819981694221497, + "step": 5920 + }, + { + "epoch": 2.49873417721519, + "grad_norm": 1.1938221454620361, + "learning_rate": 7.048736528638722e-05, + "loss": 0.6037712693214417, + "step": 5922 + }, + { + "epoch": 2.49957805907173, + "grad_norm": 1.1330323219299316, + "learning_rate": 7.046545680969545e-05, + "loss": 0.5567215085029602, + "step": 5924 + }, + { + "epoch": 2.50042194092827, + "grad_norm": 1.233564019203186, + "learning_rate": 7.044354361201585e-05, + "loss": 0.5626974105834961, + "step": 5926 + }, + { + "epoch": 2.50126582278481, + "grad_norm": 1.1913540363311768, + "learning_rate": 7.042162569840336e-05, + "loss": 0.5672739744186401, + "step": 5928 + }, + { + "epoch": 2.50210970464135, + "grad_norm": 1.060952067375183, + "learning_rate": 7.039970307391402e-05, + "loss": 0.5965602993965149, + "step": 5930 + }, + { + "epoch": 2.5029535864978905, + "grad_norm": 1.2003182172775269, + "learning_rate": 7.037777574360497e-05, + "loss": 0.590932309627533, + "step": 5932 + }, + { + "epoch": 2.5037974683544304, + "grad_norm": 1.073434829711914, + "learning_rate": 7.035584371253441e-05, + "loss": 0.5736868381500244, + "step": 5934 + }, + { + "epoch": 2.5046413502109703, + "grad_norm": 1.2641130685806274, + "learning_rate": 7.033390698576166e-05, + "loss": 0.614703357219696, + "step": 5936 + }, + { + "epoch": 2.5054852320675103, + "grad_norm": 1.2406511306762695, + "learning_rate": 7.031196556834708e-05, + "loss": 0.5866397023200989, + "step": 5938 + }, + { + "epoch": 2.5063291139240507, + "grad_norm": 1.231619119644165, + "learning_rate": 7.029001946535215e-05, + "loss": 0.5792667865753174, + "step": 5940 + }, + { + "epoch": 2.5071729957805906, + "grad_norm": 1.419447660446167, + "learning_rate": 7.026806868183939e-05, + "loss": 0.5686604976654053, + "step": 5942 + }, + { + "epoch": 2.5080168776371305, + "grad_norm": 1.139244556427002, + "learning_rate": 7.024611322287245e-05, + "loss": 0.5860661268234253, + "step": 5944 + }, + { + "epoch": 2.508860759493671, + "grad_norm": 1.070517897605896, + "learning_rate": 7.022415309351602e-05, + "loss": 0.5823250412940979, + "step": 5946 + }, + { + "epoch": 2.509704641350211, + "grad_norm": 1.0775398015975952, + "learning_rate": 7.020218829883589e-05, + "loss": 0.5291389226913452, + "step": 5948 + }, + { + "epoch": 2.510548523206751, + "grad_norm": 1.339716911315918, + "learning_rate": 7.018021884389892e-05, + "loss": 0.6215447783470154, + "step": 5950 + }, + { + "epoch": 2.511392405063291, + "grad_norm": 1.3589707612991333, + "learning_rate": 7.0158244733773e-05, + "loss": 0.5419909358024597, + "step": 5952 + }, + { + "epoch": 2.512236286919831, + "grad_norm": 1.1664098501205444, + "learning_rate": 7.01362659735272e-05, + "loss": 0.5476977229118347, + "step": 5954 + }, + { + "epoch": 2.513080168776371, + "grad_norm": 1.1184223890304565, + "learning_rate": 7.011428256823154e-05, + "loss": 0.5896323919296265, + "step": 5956 + }, + { + "epoch": 2.5139240506329115, + "grad_norm": 1.4071170091629028, + "learning_rate": 7.00922945229572e-05, + "loss": 0.6353691220283508, + "step": 5958 + }, + { + "epoch": 2.5147679324894514, + "grad_norm": 1.3740885257720947, + "learning_rate": 7.007030184277641e-05, + "loss": 0.6605582237243652, + "step": 5960 + }, + { + "epoch": 2.5156118143459913, + "grad_norm": 1.071395754814148, + "learning_rate": 7.004830453276241e-05, + "loss": 0.6399887800216675, + "step": 5962 + }, + { + "epoch": 2.5164556962025317, + "grad_norm": 1.2292311191558838, + "learning_rate": 7.002630259798962e-05, + "loss": 0.5992775559425354, + "step": 5964 + }, + { + "epoch": 2.5172995780590717, + "grad_norm": 1.0133391618728638, + "learning_rate": 7.000429604353341e-05, + "loss": 0.5716721415519714, + "step": 5966 + }, + { + "epoch": 2.5181434599156116, + "grad_norm": 1.2669343948364258, + "learning_rate": 6.998228487447032e-05, + "loss": 0.5455520749092102, + "step": 5968 + }, + { + "epoch": 2.518987341772152, + "grad_norm": 1.2026386260986328, + "learning_rate": 6.996026909587785e-05, + "loss": 0.6411572694778442, + "step": 5970 + }, + { + "epoch": 2.519831223628692, + "grad_norm": 1.359923243522644, + "learning_rate": 6.993824871283465e-05, + "loss": 0.6687750220298767, + "step": 5972 + }, + { + "epoch": 2.520675105485232, + "grad_norm": 1.1265650987625122, + "learning_rate": 6.99162237304204e-05, + "loss": 0.6271382570266724, + "step": 5974 + }, + { + "epoch": 2.5215189873417723, + "grad_norm": 1.197667121887207, + "learning_rate": 6.989419415371583e-05, + "loss": 0.6191279888153076, + "step": 5976 + }, + { + "epoch": 2.522362869198312, + "grad_norm": 1.169992446899414, + "learning_rate": 6.987215998780275e-05, + "loss": 0.6313687562942505, + "step": 5978 + }, + { + "epoch": 2.523206751054852, + "grad_norm": 1.2706433534622192, + "learning_rate": 6.9850121237764e-05, + "loss": 0.6058336496353149, + "step": 5980 + }, + { + "epoch": 2.5240506329113925, + "grad_norm": 1.322376012802124, + "learning_rate": 6.982807790868352e-05, + "loss": 0.6466464400291443, + "step": 5982 + }, + { + "epoch": 2.5248945147679325, + "grad_norm": 1.2398571968078613, + "learning_rate": 6.980603000564626e-05, + "loss": 0.5730098485946655, + "step": 5984 + }, + { + "epoch": 2.5257383966244724, + "grad_norm": 1.2035216093063354, + "learning_rate": 6.978397753373826e-05, + "loss": 0.5305635333061218, + "step": 5986 + }, + { + "epoch": 2.526582278481013, + "grad_norm": 1.1951299905776978, + "learning_rate": 6.976192049804661e-05, + "loss": 0.5601096153259277, + "step": 5988 + }, + { + "epoch": 2.5274261603375527, + "grad_norm": 0.9950459599494934, + "learning_rate": 6.973985890365945e-05, + "loss": 0.5049516558647156, + "step": 5990 + }, + { + "epoch": 2.5282700421940927, + "grad_norm": 1.2581008672714233, + "learning_rate": 6.971779275566593e-05, + "loss": 0.5456960797309875, + "step": 5992 + }, + { + "epoch": 2.529113924050633, + "grad_norm": 1.2196903228759766, + "learning_rate": 6.969572205915632e-05, + "loss": 0.6026827096939087, + "step": 5994 + }, + { + "epoch": 2.529957805907173, + "grad_norm": 1.3109357357025146, + "learning_rate": 6.967364681922189e-05, + "loss": 0.597453236579895, + "step": 5996 + }, + { + "epoch": 2.530801687763713, + "grad_norm": 1.016904354095459, + "learning_rate": 6.965156704095498e-05, + "loss": 0.5304323434829712, + "step": 5998 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 1.2363858222961426, + "learning_rate": 6.962948272944896e-05, + "loss": 0.5748253464698792, + "step": 6000 + }, + { + "epoch": 2.5316455696202533, + "eval_loss": 0.6813357472419739, + "eval_runtime": 513.5491, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.231982336554848e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..547510ff0f44372ec08c1f4a0b2f5b4dd5cc7306 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1e43af2e542472f36eafffb2f5f8925e103ddeff9b464393833786ef51a733 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a39faed684f2703ba1f9830fed4b8d01fd02dad --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73b5d19b10c301988f4559db1fecd4070436e326a852d1f33aff85b33b1c2bc +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..517eab73fb3156f665dcd0fb4fa84d03683090cc --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975a45b2cbb52242a1862667a154f713372ac785cc44760bf03cd69812fefa6a +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..73eceeb8937da23c1964990d76fa61f6e30059b7 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8eb53dc0cfe9ef2de0950817407f4810fef5054d1535fb4859c69d56ed4b88 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..94a79d4d670a544e4d26e9f878cbd87bb2754aec --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/trainer_state.json @@ -0,0 +1,23313 @@ +{ + "best_global_step": 6400, + "best_metric": 0.6764505505561829, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-6000", + "epoch": 2.742616033755274, + "eval_steps": 100, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + }, + { + "epoch": 1.89957805907173, + "grad_norm": 0.9778118133544922, + "learning_rate": 8.456131445302538e-05, + "loss": 0.5973100662231445, + "step": 4502 + }, + { + "epoch": 1.90042194092827, + "grad_norm": 0.9587083458900452, + "learning_rate": 8.454395658110536e-05, + "loss": 0.5982911586761475, + "step": 4504 + }, + { + "epoch": 1.90126582278481, + "grad_norm": 1.327643871307373, + "learning_rate": 8.452659074055798e-05, + "loss": 0.6858586668968201, + "step": 4506 + }, + { + "epoch": 1.9021097046413502, + "grad_norm": 1.0740257501602173, + "learning_rate": 8.450921693538922e-05, + "loss": 0.6172328591346741, + "step": 4508 + }, + { + "epoch": 1.9029535864978904, + "grad_norm": 1.0705101490020752, + "learning_rate": 8.449183516960685e-05, + "loss": 0.5349634289741516, + "step": 4510 + }, + { + "epoch": 1.9037974683544303, + "grad_norm": 0.9151237607002258, + "learning_rate": 8.447444544722058e-05, + "loss": 0.5769277811050415, + "step": 4512 + }, + { + "epoch": 1.9046413502109705, + "grad_norm": 1.139900803565979, + "learning_rate": 8.44570477722418e-05, + "loss": 0.6579093933105469, + "step": 4514 + }, + { + "epoch": 1.9054852320675106, + "grad_norm": 1.2481658458709717, + "learning_rate": 8.443964214868387e-05, + "loss": 0.6748929619789124, + "step": 4516 + }, + { + "epoch": 1.9063291139240506, + "grad_norm": 1.1661686897277832, + "learning_rate": 8.442222858056193e-05, + "loss": 0.6492021083831787, + "step": 4518 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 1.241477370262146, + "learning_rate": 8.440480707189295e-05, + "loss": 0.635409951210022, + "step": 4520 + }, + { + "epoch": 1.908016877637131, + "grad_norm": 1.1102054119110107, + "learning_rate": 8.438737762669573e-05, + "loss": 0.631928026676178, + "step": 4522 + }, + { + "epoch": 1.9088607594936708, + "grad_norm": 1.0638107061386108, + "learning_rate": 8.43699402489909e-05, + "loss": 0.604518473148346, + "step": 4524 + }, + { + "epoch": 1.909704641350211, + "grad_norm": 1.0270655155181885, + "learning_rate": 8.435249494280096e-05, + "loss": 0.61314457654953, + "step": 4526 + }, + { + "epoch": 1.9105485232067512, + "grad_norm": 1.1840111017227173, + "learning_rate": 8.433504171215018e-05, + "loss": 0.661663293838501, + "step": 4528 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 1.1404399871826172, + "learning_rate": 8.43175805610647e-05, + "loss": 0.7026967406272888, + "step": 4530 + }, + { + "epoch": 1.9122362869198313, + "grad_norm": 1.2371265888214111, + "learning_rate": 8.430011149357246e-05, + "loss": 0.6599440574645996, + "step": 4532 + }, + { + "epoch": 1.9130801687763714, + "grad_norm": 1.0042651891708374, + "learning_rate": 8.428263451370326e-05, + "loss": 0.5728344321250916, + "step": 4534 + }, + { + "epoch": 1.9139240506329114, + "grad_norm": 1.04367196559906, + "learning_rate": 8.426514962548866e-05, + "loss": 0.6495450735092163, + "step": 4536 + }, + { + "epoch": 1.9147679324894513, + "grad_norm": 1.0867135524749756, + "learning_rate": 8.424765683296215e-05, + "loss": 0.6406553387641907, + "step": 4538 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 1.0751310586929321, + "learning_rate": 8.423015614015892e-05, + "loss": 0.6692186594009399, + "step": 4540 + }, + { + "epoch": 1.9164556962025316, + "grad_norm": 1.13556969165802, + "learning_rate": 8.421264755111607e-05, + "loss": 0.6029785871505737, + "step": 4542 + }, + { + "epoch": 1.9172995780590716, + "grad_norm": 1.1560977697372437, + "learning_rate": 8.419513106987251e-05, + "loss": 0.6457844972610474, + "step": 4544 + }, + { + "epoch": 1.918143459915612, + "grad_norm": 1.2192902565002441, + "learning_rate": 8.417760670046893e-05, + "loss": 0.7082147598266602, + "step": 4546 + }, + { + "epoch": 1.918987341772152, + "grad_norm": 1.1170696020126343, + "learning_rate": 8.41600744469479e-05, + "loss": 0.6919234991073608, + "step": 4548 + }, + { + "epoch": 1.9198312236286919, + "grad_norm": 1.061253547668457, + "learning_rate": 8.414253431335373e-05, + "loss": 0.6310052871704102, + "step": 4550 + }, + { + "epoch": 1.920675105485232, + "grad_norm": 1.0671885013580322, + "learning_rate": 8.412498630373263e-05, + "loss": 0.6330236792564392, + "step": 4552 + }, + { + "epoch": 1.9215189873417722, + "grad_norm": 1.2085163593292236, + "learning_rate": 8.410743042213256e-05, + "loss": 0.7031015157699585, + "step": 4554 + }, + { + "epoch": 1.9223628691983121, + "grad_norm": 1.2682013511657715, + "learning_rate": 8.408986667260334e-05, + "loss": 0.7078304290771484, + "step": 4556 + }, + { + "epoch": 1.9232067510548523, + "grad_norm": 1.2966876029968262, + "learning_rate": 8.407229505919658e-05, + "loss": 0.6542860865592957, + "step": 4558 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 1.1086169481277466, + "learning_rate": 8.405471558596573e-05, + "loss": 0.5856828093528748, + "step": 4560 + }, + { + "epoch": 1.9248945147679324, + "grad_norm": 1.3175504207611084, + "learning_rate": 8.403712825696604e-05, + "loss": 0.7382104992866516, + "step": 4562 + }, + { + "epoch": 1.9257383966244725, + "grad_norm": 1.163164496421814, + "learning_rate": 8.401953307625454e-05, + "loss": 0.6862360239028931, + "step": 4564 + }, + { + "epoch": 1.9265822784810127, + "grad_norm": 1.207650899887085, + "learning_rate": 8.400193004789013e-05, + "loss": 0.7442302703857422, + "step": 4566 + }, + { + "epoch": 1.9274261603375527, + "grad_norm": 1.1570589542388916, + "learning_rate": 8.398431917593345e-05, + "loss": 0.595226526260376, + "step": 4568 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 1.091927170753479, + "learning_rate": 8.396670046444704e-05, + "loss": 0.6360410451889038, + "step": 4570 + }, + { + "epoch": 1.929113924050633, + "grad_norm": 1.149559497833252, + "learning_rate": 8.394907391749516e-05, + "loss": 0.6343122124671936, + "step": 4572 + }, + { + "epoch": 1.929957805907173, + "grad_norm": 1.0585254430770874, + "learning_rate": 8.393143953914395e-05, + "loss": 0.7394745349884033, + "step": 4574 + }, + { + "epoch": 1.930801687763713, + "grad_norm": 1.1648521423339844, + "learning_rate": 8.391379733346128e-05, + "loss": 0.6489678025245667, + "step": 4576 + }, + { + "epoch": 1.9316455696202532, + "grad_norm": 1.1756316423416138, + "learning_rate": 8.389614730451692e-05, + "loss": 0.6687861084938049, + "step": 4578 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.9857237339019775, + "learning_rate": 8.387848945638235e-05, + "loss": 0.523727536201477, + "step": 4580 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.1038693189620972, + "learning_rate": 8.386082379313092e-05, + "loss": 0.6545047760009766, + "step": 4582 + }, + { + "epoch": 1.9341772151898735, + "grad_norm": 1.0780832767486572, + "learning_rate": 8.384315031883774e-05, + "loss": 0.6067036390304565, + "step": 4584 + }, + { + "epoch": 1.9350210970464135, + "grad_norm": 1.2915070056915283, + "learning_rate": 8.382546903757975e-05, + "loss": 0.6880824565887451, + "step": 4586 + }, + { + "epoch": 1.9358649789029536, + "grad_norm": 1.1243441104888916, + "learning_rate": 8.380777995343568e-05, + "loss": 0.7319117188453674, + "step": 4588 + }, + { + "epoch": 1.9367088607594938, + "grad_norm": 1.1143072843551636, + "learning_rate": 8.379008307048609e-05, + "loss": 0.6845395565032959, + "step": 4590 + }, + { + "epoch": 1.9375527426160337, + "grad_norm": 1.039494276046753, + "learning_rate": 8.377237839281327e-05, + "loss": 0.6653600335121155, + "step": 4592 + }, + { + "epoch": 1.9383966244725739, + "grad_norm": 1.299617886543274, + "learning_rate": 8.375466592450136e-05, + "loss": 0.6352495551109314, + "step": 4594 + }, + { + "epoch": 1.939240506329114, + "grad_norm": 0.9918657541275024, + "learning_rate": 8.373694566963631e-05, + "loss": 0.5660957098007202, + "step": 4596 + }, + { + "epoch": 1.940084388185654, + "grad_norm": 1.0540478229522705, + "learning_rate": 8.371921763230579e-05, + "loss": 0.6296496987342834, + "step": 4598 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 1.1309545040130615, + "learning_rate": 8.370148181659939e-05, + "loss": 0.6672025918960571, + "step": 4600 + }, + { + "epoch": 1.9409282700421941, + "eval_loss": 0.6930755376815796, + "eval_runtime": 617.8927, + "eval_samples_per_second": 3.41, + "eval_steps_per_second": 3.41, + "step": 4600 + }, + { + "epoch": 1.9417721518987343, + "grad_norm": 1.2338588237762451, + "learning_rate": 8.368373822660836e-05, + "loss": 0.6200884580612183, + "step": 4602 + }, + { + "epoch": 1.9426160337552743, + "grad_norm": 1.1756945848464966, + "learning_rate": 8.366598686642582e-05, + "loss": 0.653294026851654, + "step": 4604 + }, + { + "epoch": 1.9434599156118142, + "grad_norm": 1.032018780708313, + "learning_rate": 8.364822774014671e-05, + "loss": 0.5670395493507385, + "step": 4606 + }, + { + "epoch": 1.9443037974683546, + "grad_norm": 1.045280933380127, + "learning_rate": 8.363046085186766e-05, + "loss": 0.6819197535514832, + "step": 4608 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 1.3223930597305298, + "learning_rate": 8.36126862056872e-05, + "loss": 0.6952820420265198, + "step": 4610 + }, + { + "epoch": 1.9459915611814345, + "grad_norm": 1.0048432350158691, + "learning_rate": 8.359490380570556e-05, + "loss": 0.5291440486907959, + "step": 4612 + }, + { + "epoch": 1.9468354430379748, + "grad_norm": 1.1477346420288086, + "learning_rate": 8.357711365602483e-05, + "loss": 0.6857813000679016, + "step": 4614 + }, + { + "epoch": 1.9476793248945148, + "grad_norm": 0.959985077381134, + "learning_rate": 8.355931576074882e-05, + "loss": 0.5581508278846741, + "step": 4616 + }, + { + "epoch": 1.9485232067510547, + "grad_norm": 1.1104289293289185, + "learning_rate": 8.35415101239832e-05, + "loss": 0.6536211371421814, + "step": 4618 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 1.2344517707824707, + "learning_rate": 8.352369674983535e-05, + "loss": 0.6570560336112976, + "step": 4620 + }, + { + "epoch": 1.950210970464135, + "grad_norm": 1.3411606550216675, + "learning_rate": 8.350587564241451e-05, + "loss": 0.6070495247840881, + "step": 4622 + }, + { + "epoch": 1.951054852320675, + "grad_norm": 1.1713159084320068, + "learning_rate": 8.348804680583166e-05, + "loss": 0.6444135904312134, + "step": 4624 + }, + { + "epoch": 1.9518987341772152, + "grad_norm": 1.127242922782898, + "learning_rate": 8.347021024419954e-05, + "loss": 0.6517419815063477, + "step": 4626 + }, + { + "epoch": 1.9527426160337553, + "grad_norm": 1.0733028650283813, + "learning_rate": 8.345236596163274e-05, + "loss": 0.6174065470695496, + "step": 4628 + }, + { + "epoch": 1.9535864978902953, + "grad_norm": 1.1114680767059326, + "learning_rate": 8.343451396224757e-05, + "loss": 0.7163593769073486, + "step": 4630 + }, + { + "epoch": 1.9544303797468354, + "grad_norm": 1.0839568376541138, + "learning_rate": 8.341665425016216e-05, + "loss": 0.698553204536438, + "step": 4632 + }, + { + "epoch": 1.9552742616033756, + "grad_norm": 1.17001211643219, + "learning_rate": 8.339878682949638e-05, + "loss": 0.6224857568740845, + "step": 4634 + }, + { + "epoch": 1.9561181434599155, + "grad_norm": 3.483793020248413, + "learning_rate": 8.338091170437193e-05, + "loss": 0.5931200981140137, + "step": 4636 + }, + { + "epoch": 1.9569620253164557, + "grad_norm": 1.1575394868850708, + "learning_rate": 8.336302887891224e-05, + "loss": 0.6031442284584045, + "step": 4638 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 1.1494992971420288, + "learning_rate": 8.334513835724252e-05, + "loss": 0.6101768016815186, + "step": 4640 + }, + { + "epoch": 1.9586497890295358, + "grad_norm": 1.3858197927474976, + "learning_rate": 8.332724014348981e-05, + "loss": 0.6571711301803589, + "step": 4642 + }, + { + "epoch": 1.959493670886076, + "grad_norm": 1.1094943284988403, + "learning_rate": 8.330933424178284e-05, + "loss": 0.6391071677207947, + "step": 4644 + }, + { + "epoch": 1.9603375527426161, + "grad_norm": 1.1640198230743408, + "learning_rate": 8.329142065625218e-05, + "loss": 0.6542805433273315, + "step": 4646 + }, + { + "epoch": 1.961181434599156, + "grad_norm": 1.1080211400985718, + "learning_rate": 8.327349939103016e-05, + "loss": 0.6053075194358826, + "step": 4648 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.0137052536010742, + "learning_rate": 8.325557045025085e-05, + "loss": 0.6009573340415955, + "step": 4650 + }, + { + "epoch": 1.9628691983122364, + "grad_norm": 1.0867283344268799, + "learning_rate": 8.323763383805012e-05, + "loss": 0.5993483066558838, + "step": 4652 + }, + { + "epoch": 1.9637130801687763, + "grad_norm": 1.0577161312103271, + "learning_rate": 8.321968955856562e-05, + "loss": 0.6788463592529297, + "step": 4654 + }, + { + "epoch": 1.9645569620253165, + "grad_norm": 1.2002183198928833, + "learning_rate": 8.320173761593672e-05, + "loss": 0.5786917209625244, + "step": 4656 + }, + { + "epoch": 1.9654008438818567, + "grad_norm": 1.2266993522644043, + "learning_rate": 8.318377801430461e-05, + "loss": 0.7437994480133057, + "step": 4658 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 1.007582187652588, + "learning_rate": 8.316581075781223e-05, + "loss": 0.6763550639152527, + "step": 4660 + }, + { + "epoch": 1.9670886075949368, + "grad_norm": 1.2374811172485352, + "learning_rate": 8.314783585060425e-05, + "loss": 0.6953140497207642, + "step": 4662 + }, + { + "epoch": 1.967932489451477, + "grad_norm": 1.1791057586669922, + "learning_rate": 8.312985329682717e-05, + "loss": 0.6867341995239258, + "step": 4664 + }, + { + "epoch": 1.9687763713080169, + "grad_norm": 1.1903331279754639, + "learning_rate": 8.31118631006292e-05, + "loss": 0.6445001363754272, + "step": 4666 + }, + { + "epoch": 1.9696202531645568, + "grad_norm": 1.1731067895889282, + "learning_rate": 8.309386526616034e-05, + "loss": 0.6500589847564697, + "step": 4668 + }, + { + "epoch": 1.9704641350210972, + "grad_norm": 0.9470233917236328, + "learning_rate": 8.307585979757233e-05, + "loss": 0.6215718984603882, + "step": 4670 + }, + { + "epoch": 1.9713080168776371, + "grad_norm": 1.2900800704956055, + "learning_rate": 8.305784669901872e-05, + "loss": 0.6396787762641907, + "step": 4672 + }, + { + "epoch": 1.972151898734177, + "grad_norm": 1.1729133129119873, + "learning_rate": 8.303982597465474e-05, + "loss": 0.6581959128379822, + "step": 4674 + }, + { + "epoch": 1.9729957805907175, + "grad_norm": 1.1450555324554443, + "learning_rate": 8.302179762863746e-05, + "loss": 0.7013490796089172, + "step": 4676 + }, + { + "epoch": 1.9738396624472574, + "grad_norm": 1.1506338119506836, + "learning_rate": 8.300376166512567e-05, + "loss": 0.6796102523803711, + "step": 4678 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 1.149979591369629, + "learning_rate": 8.298571808827991e-05, + "loss": 0.6960519552230835, + "step": 4680 + }, + { + "epoch": 1.9755274261603377, + "grad_norm": 1.1078912019729614, + "learning_rate": 8.296766690226249e-05, + "loss": 0.6789507865905762, + "step": 4682 + }, + { + "epoch": 1.9763713080168777, + "grad_norm": 1.0199202299118042, + "learning_rate": 8.294960811123747e-05, + "loss": 0.5962659120559692, + "step": 4684 + }, + { + "epoch": 1.9772151898734176, + "grad_norm": 1.2226134538650513, + "learning_rate": 8.293154171937068e-05, + "loss": 0.6483094692230225, + "step": 4686 + }, + { + "epoch": 1.9780590717299578, + "grad_norm": 1.184095025062561, + "learning_rate": 8.291346773082965e-05, + "loss": 0.6750242710113525, + "step": 4688 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 1.1018693447113037, + "learning_rate": 8.289538614978375e-05, + "loss": 0.7094066739082336, + "step": 4690 + }, + { + "epoch": 1.9797468354430379, + "grad_norm": 1.0342390537261963, + "learning_rate": 8.287729698040403e-05, + "loss": 0.6554126739501953, + "step": 4692 + }, + { + "epoch": 1.980590717299578, + "grad_norm": 1.0603563785552979, + "learning_rate": 8.285920022686332e-05, + "loss": 0.5493529438972473, + "step": 4694 + }, + { + "epoch": 1.9814345991561182, + "grad_norm": 1.139609932899475, + "learning_rate": 8.284109589333617e-05, + "loss": 0.6824741363525391, + "step": 4696 + }, + { + "epoch": 1.9822784810126581, + "grad_norm": 1.2167822122573853, + "learning_rate": 8.282298398399895e-05, + "loss": 0.7121000289916992, + "step": 4698 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 1.109857201576233, + "learning_rate": 8.280486450302968e-05, + "loss": 0.6711249351501465, + "step": 4700 + }, + { + "epoch": 1.9831223628691983, + "eval_loss": 0.6923081278800964, + "eval_runtime": 514.7729, + "eval_samples_per_second": 4.093, + "eval_steps_per_second": 4.093, + "step": 4700 + }, + { + "epoch": 1.9839662447257385, + "grad_norm": 1.1387107372283936, + "learning_rate": 8.27867374546082e-05, + "loss": 0.581635594367981, + "step": 4702 + }, + { + "epoch": 1.9848101265822784, + "grad_norm": 1.2519257068634033, + "learning_rate": 8.27686028429161e-05, + "loss": 0.6867302060127258, + "step": 4704 + }, + { + "epoch": 1.9856540084388186, + "grad_norm": 1.0927205085754395, + "learning_rate": 8.275046067213663e-05, + "loss": 0.6494556665420532, + "step": 4706 + }, + { + "epoch": 1.9864978902953587, + "grad_norm": 1.042035698890686, + "learning_rate": 8.273231094645487e-05, + "loss": 0.6949493288993835, + "step": 4708 + }, + { + "epoch": 1.9873417721518987, + "grad_norm": 1.0220824480056763, + "learning_rate": 8.271415367005762e-05, + "loss": 0.6535884737968445, + "step": 4710 + }, + { + "epoch": 1.9881856540084388, + "grad_norm": 1.3023611307144165, + "learning_rate": 8.269598884713339e-05, + "loss": 0.6635278463363647, + "step": 4712 + }, + { + "epoch": 1.989029535864979, + "grad_norm": 1.2526965141296387, + "learning_rate": 8.267781648187248e-05, + "loss": 0.7194697856903076, + "step": 4714 + }, + { + "epoch": 1.989873417721519, + "grad_norm": 1.0388038158416748, + "learning_rate": 8.265963657846691e-05, + "loss": 0.6355333924293518, + "step": 4716 + }, + { + "epoch": 1.990717299578059, + "grad_norm": 1.0852965116500854, + "learning_rate": 8.264144914111041e-05, + "loss": 0.6898305416107178, + "step": 4718 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 1.0714049339294434, + "learning_rate": 8.262325417399847e-05, + "loss": 0.6202836036682129, + "step": 4720 + }, + { + "epoch": 1.9924050632911392, + "grad_norm": 1.0767238140106201, + "learning_rate": 8.260505168132835e-05, + "loss": 0.6160458326339722, + "step": 4722 + }, + { + "epoch": 1.9932489451476794, + "grad_norm": 0.9605211615562439, + "learning_rate": 8.258684166729899e-05, + "loss": 0.6049920916557312, + "step": 4724 + }, + { + "epoch": 1.9940928270042195, + "grad_norm": 1.0580185651779175, + "learning_rate": 8.256862413611113e-05, + "loss": 0.5622014999389648, + "step": 4726 + }, + { + "epoch": 1.9949367088607595, + "grad_norm": 1.1039034128189087, + "learning_rate": 8.255039909196713e-05, + "loss": 0.6678924560546875, + "step": 4728 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 1.1482586860656738, + "learning_rate": 8.253216653907123e-05, + "loss": 0.658260703086853, + "step": 4730 + }, + { + "epoch": 1.9966244725738398, + "grad_norm": 1.135349988937378, + "learning_rate": 8.251392648162929e-05, + "loss": 0.6461613178253174, + "step": 4732 + }, + { + "epoch": 1.9974683544303797, + "grad_norm": 1.0155420303344727, + "learning_rate": 8.249567892384895e-05, + "loss": 0.6837426424026489, + "step": 4734 + }, + { + "epoch": 1.9983122362869197, + "grad_norm": 1.3392970561981201, + "learning_rate": 8.247742386993958e-05, + "loss": 0.6091697812080383, + "step": 4736 + }, + { + "epoch": 1.99915611814346, + "grad_norm": 1.0509974956512451, + "learning_rate": 8.245916132411226e-05, + "loss": 0.6539653539657593, + "step": 4738 + }, + { + "epoch": 2.0, + "grad_norm": 0.9777396321296692, + "learning_rate": 8.244089129057982e-05, + "loss": 0.5630147457122803, + "step": 4740 + }, + { + "epoch": 2.00084388185654, + "grad_norm": 1.1639164686203003, + "learning_rate": 8.24226137735568e-05, + "loss": 0.6190353631973267, + "step": 4742 + }, + { + "epoch": 2.0016877637130803, + "grad_norm": 1.119614839553833, + "learning_rate": 8.240432877725947e-05, + "loss": 0.6282529234886169, + "step": 4744 + }, + { + "epoch": 2.0025316455696203, + "grad_norm": 1.114739179611206, + "learning_rate": 8.238603630590581e-05, + "loss": 0.6176725625991821, + "step": 4746 + }, + { + "epoch": 2.00337552742616, + "grad_norm": 1.0543076992034912, + "learning_rate": 8.236773636371557e-05, + "loss": 0.5182007551193237, + "step": 4748 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 1.060389518737793, + "learning_rate": 8.234942895491019e-05, + "loss": 0.532536506652832, + "step": 4750 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 1.0824412107467651, + "learning_rate": 8.233111408371282e-05, + "loss": 0.5474061369895935, + "step": 4752 + }, + { + "epoch": 2.0059071729957805, + "grad_norm": 1.1450858116149902, + "learning_rate": 8.231279175434838e-05, + "loss": 0.586384654045105, + "step": 4754 + }, + { + "epoch": 2.006751054852321, + "grad_norm": 1.1225577592849731, + "learning_rate": 8.229446197104345e-05, + "loss": 0.6469444036483765, + "step": 4756 + }, + { + "epoch": 2.007594936708861, + "grad_norm": 1.7292449474334717, + "learning_rate": 8.227612473802637e-05, + "loss": 0.5371572971343994, + "step": 4758 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 1.1743781566619873, + "learning_rate": 8.22577800595272e-05, + "loss": 0.558707058429718, + "step": 4760 + }, + { + "epoch": 2.009282700421941, + "grad_norm": 1.0385273694992065, + "learning_rate": 8.223942793977769e-05, + "loss": 0.5943514108657837, + "step": 4762 + }, + { + "epoch": 2.010126582278481, + "grad_norm": 1.1302000284194946, + "learning_rate": 8.222106838301131e-05, + "loss": 0.5630753636360168, + "step": 4764 + }, + { + "epoch": 2.010970464135021, + "grad_norm": 1.140005111694336, + "learning_rate": 8.220270139346327e-05, + "loss": 0.527510404586792, + "step": 4766 + }, + { + "epoch": 2.0118143459915614, + "grad_norm": 1.1979734897613525, + "learning_rate": 8.21843269753705e-05, + "loss": 0.6315013766288757, + "step": 4768 + }, + { + "epoch": 2.0126582278481013, + "grad_norm": 1.3759459257125854, + "learning_rate": 8.21659451329716e-05, + "loss": 0.6225199699401855, + "step": 4770 + }, + { + "epoch": 2.0135021097046413, + "grad_norm": 1.330600380897522, + "learning_rate": 8.21475558705069e-05, + "loss": 0.6838938593864441, + "step": 4772 + }, + { + "epoch": 2.014345991561181, + "grad_norm": 1.2365351915359497, + "learning_rate": 8.21291591922185e-05, + "loss": 0.606302797794342, + "step": 4774 + }, + { + "epoch": 2.0151898734177216, + "grad_norm": 1.1886142492294312, + "learning_rate": 8.211075510235011e-05, + "loss": 0.6194182634353638, + "step": 4776 + }, + { + "epoch": 2.0160337552742615, + "grad_norm": 1.1414743661880493, + "learning_rate": 8.209234360514721e-05, + "loss": 0.639540433883667, + "step": 4778 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 1.2877455949783325, + "learning_rate": 8.2073924704857e-05, + "loss": 0.6350902318954468, + "step": 4780 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 1.095578908920288, + "learning_rate": 8.205549840572834e-05, + "loss": 0.5152000784873962, + "step": 4782 + }, + { + "epoch": 2.018565400843882, + "grad_norm": 1.0043798685073853, + "learning_rate": 8.203706471201183e-05, + "loss": 0.46245837211608887, + "step": 4784 + }, + { + "epoch": 2.0194092827004217, + "grad_norm": 1.2133857011795044, + "learning_rate": 8.201862362795979e-05, + "loss": 0.6471722722053528, + "step": 4786 + }, + { + "epoch": 2.020253164556962, + "grad_norm": 1.0835390090942383, + "learning_rate": 8.200017515782619e-05, + "loss": 0.5790625214576721, + "step": 4788 + }, + { + "epoch": 2.021097046413502, + "grad_norm": 1.0176091194152832, + "learning_rate": 8.198171930586678e-05, + "loss": 0.5826238989830017, + "step": 4790 + }, + { + "epoch": 2.021940928270042, + "grad_norm": 1.1581370830535889, + "learning_rate": 8.196325607633893e-05, + "loss": 0.5781272649765015, + "step": 4792 + }, + { + "epoch": 2.0227848101265824, + "grad_norm": 1.243381142616272, + "learning_rate": 8.194478547350178e-05, + "loss": 0.6600401997566223, + "step": 4794 + }, + { + "epoch": 2.0236286919831223, + "grad_norm": 1.0718560218811035, + "learning_rate": 8.192630750161612e-05, + "loss": 0.5291268825531006, + "step": 4796 + }, + { + "epoch": 2.0244725738396623, + "grad_norm": 1.2338320016860962, + "learning_rate": 8.190782216494448e-05, + "loss": 0.6564924120903015, + "step": 4798 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.978547990322113, + "learning_rate": 8.188932946775107e-05, + "loss": 0.5471183657646179, + "step": 4800 + }, + { + "epoch": 2.0253164556962027, + "eval_loss": 0.6924457550048828, + "eval_runtime": 514.0427, + "eval_samples_per_second": 4.099, + "eval_steps_per_second": 4.099, + "step": 4800 + }, + { + "epoch": 2.0261603375527426, + "grad_norm": 1.1782792806625366, + "learning_rate": 8.18708294143018e-05, + "loss": 0.567442774772644, + "step": 4802 + }, + { + "epoch": 2.0270042194092825, + "grad_norm": 1.0768574476242065, + "learning_rate": 8.185232200886426e-05, + "loss": 0.6005180478096008, + "step": 4804 + }, + { + "epoch": 2.027848101265823, + "grad_norm": 1.3096717596054077, + "learning_rate": 8.18338072557078e-05, + "loss": 0.616436779499054, + "step": 4806 + }, + { + "epoch": 2.028691983122363, + "grad_norm": 1.0233508348464966, + "learning_rate": 8.181528515910336e-05, + "loss": 0.49587416648864746, + "step": 4808 + }, + { + "epoch": 2.029535864978903, + "grad_norm": 1.0800065994262695, + "learning_rate": 8.179675572332366e-05, + "loss": 0.5758571624755859, + "step": 4810 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 1.09299898147583, + "learning_rate": 8.177821895264309e-05, + "loss": 0.561736524105072, + "step": 4812 + }, + { + "epoch": 2.031223628691983, + "grad_norm": 1.1439210176467896, + "learning_rate": 8.175967485133771e-05, + "loss": 0.5249468088150024, + "step": 4814 + }, + { + "epoch": 2.032067510548523, + "grad_norm": 1.15841805934906, + "learning_rate": 8.174112342368532e-05, + "loss": 0.6429001688957214, + "step": 4816 + }, + { + "epoch": 2.0329113924050635, + "grad_norm": 1.1720670461654663, + "learning_rate": 8.172256467396533e-05, + "loss": 0.60152667760849, + "step": 4818 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 1.2652091979980469, + "learning_rate": 8.170399860645892e-05, + "loss": 0.5553541779518127, + "step": 4820 + }, + { + "epoch": 2.0345991561181433, + "grad_norm": 1.0768507719039917, + "learning_rate": 8.168542522544893e-05, + "loss": 0.5369323492050171, + "step": 4822 + }, + { + "epoch": 2.0354430379746837, + "grad_norm": 0.9906469583511353, + "learning_rate": 8.166684453521986e-05, + "loss": 0.5468952655792236, + "step": 4824 + }, + { + "epoch": 2.0362869198312237, + "grad_norm": 1.3448988199234009, + "learning_rate": 8.164825654005792e-05, + "loss": 0.5795659422874451, + "step": 4826 + }, + { + "epoch": 2.0371308016877636, + "grad_norm": 1.2502341270446777, + "learning_rate": 8.162966124425103e-05, + "loss": 0.6465779542922974, + "step": 4828 + }, + { + "epoch": 2.037974683544304, + "grad_norm": 1.1512303352355957, + "learning_rate": 8.161105865208875e-05, + "loss": 0.5509394407272339, + "step": 4830 + }, + { + "epoch": 2.038818565400844, + "grad_norm": 1.2513408660888672, + "learning_rate": 8.159244876786232e-05, + "loss": 0.5515735745429993, + "step": 4832 + }, + { + "epoch": 2.039662447257384, + "grad_norm": 1.3035682439804077, + "learning_rate": 8.157383159586473e-05, + "loss": 0.757799506187439, + "step": 4834 + }, + { + "epoch": 2.0405063291139243, + "grad_norm": 1.1136540174484253, + "learning_rate": 8.155520714039056e-05, + "loss": 0.607295036315918, + "step": 4836 + }, + { + "epoch": 2.041350210970464, + "grad_norm": 1.220146656036377, + "learning_rate": 8.153657540573613e-05, + "loss": 0.5769712328910828, + "step": 4838 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 1.2104195356369019, + "learning_rate": 8.151793639619944e-05, + "loss": 0.5746933817863464, + "step": 4840 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 1.241708517074585, + "learning_rate": 8.149929011608014e-05, + "loss": 0.5932332277297974, + "step": 4842 + }, + { + "epoch": 2.0438818565400845, + "grad_norm": 1.1172713041305542, + "learning_rate": 8.148063656967955e-05, + "loss": 0.583284318447113, + "step": 4844 + }, + { + "epoch": 2.0447257383966244, + "grad_norm": 1.0867618322372437, + "learning_rate": 8.14619757613007e-05, + "loss": 0.5589476823806763, + "step": 4846 + }, + { + "epoch": 2.0455696202531644, + "grad_norm": 1.2470483779907227, + "learning_rate": 8.14433076952483e-05, + "loss": 0.6118156313896179, + "step": 4848 + }, + { + "epoch": 2.0464135021097047, + "grad_norm": 1.0908832550048828, + "learning_rate": 8.142463237582868e-05, + "loss": 0.5815895795822144, + "step": 4850 + }, + { + "epoch": 2.0472573839662447, + "grad_norm": 1.2589281797409058, + "learning_rate": 8.140594980734989e-05, + "loss": 0.6232373714447021, + "step": 4852 + }, + { + "epoch": 2.0481012658227846, + "grad_norm": 1.234152913093567, + "learning_rate": 8.138725999412165e-05, + "loss": 0.5992053151130676, + "step": 4854 + }, + { + "epoch": 2.048945147679325, + "grad_norm": 1.3304446935653687, + "learning_rate": 8.136856294045533e-05, + "loss": 0.6494496464729309, + "step": 4856 + }, + { + "epoch": 2.049789029535865, + "grad_norm": 1.1871088743209839, + "learning_rate": 8.134985865066398e-05, + "loss": 0.6263431906700134, + "step": 4858 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 1.1454699039459229, + "learning_rate": 8.133114712906234e-05, + "loss": 0.6036502122879028, + "step": 4860 + }, + { + "epoch": 2.0514767932489453, + "grad_norm": 1.2953420877456665, + "learning_rate": 8.131242837996675e-05, + "loss": 0.5674451589584351, + "step": 4862 + }, + { + "epoch": 2.052320675105485, + "grad_norm": 1.1874405145645142, + "learning_rate": 8.129370240769534e-05, + "loss": 0.5616317987442017, + "step": 4864 + }, + { + "epoch": 2.053164556962025, + "grad_norm": 1.2936227321624756, + "learning_rate": 8.127496921656777e-05, + "loss": 0.6495023369789124, + "step": 4866 + }, + { + "epoch": 2.0540084388185655, + "grad_norm": 1.1935228109359741, + "learning_rate": 8.125622881090544e-05, + "loss": 0.6028099060058594, + "step": 4868 + }, + { + "epoch": 2.0548523206751055, + "grad_norm": 0.9932331442832947, + "learning_rate": 8.123748119503143e-05, + "loss": 0.476296067237854, + "step": 4870 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 1.3878839015960693, + "learning_rate": 8.121872637327042e-05, + "loss": 0.6191902756690979, + "step": 4872 + }, + { + "epoch": 2.056540084388186, + "grad_norm": 1.1185581684112549, + "learning_rate": 8.11999643499488e-05, + "loss": 0.566487729549408, + "step": 4874 + }, + { + "epoch": 2.0573839662447257, + "grad_norm": 1.3729257583618164, + "learning_rate": 8.118119512939464e-05, + "loss": 0.5970078706741333, + "step": 4876 + }, + { + "epoch": 2.0582278481012657, + "grad_norm": 1.1332688331604004, + "learning_rate": 8.11624187159376e-05, + "loss": 0.570341944694519, + "step": 4878 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 1.2648937702178955, + "learning_rate": 8.114363511390903e-05, + "loss": 0.6302897334098816, + "step": 4880 + }, + { + "epoch": 2.059915611814346, + "grad_norm": 1.250616192817688, + "learning_rate": 8.112484432764197e-05, + "loss": 0.5619142651557922, + "step": 4882 + }, + { + "epoch": 2.060759493670886, + "grad_norm": 0.9710861444473267, + "learning_rate": 8.110604636147109e-05, + "loss": 0.5426228642463684, + "step": 4884 + }, + { + "epoch": 2.0616033755274263, + "grad_norm": 1.1979506015777588, + "learning_rate": 8.108724121973271e-05, + "loss": 0.5498107671737671, + "step": 4886 + }, + { + "epoch": 2.0624472573839663, + "grad_norm": 1.0936485528945923, + "learning_rate": 8.106842890676483e-05, + "loss": 0.5695134401321411, + "step": 4888 + }, + { + "epoch": 2.0632911392405062, + "grad_norm": 1.1246092319488525, + "learning_rate": 8.10496094269071e-05, + "loss": 0.5998331308364868, + "step": 4890 + }, + { + "epoch": 2.0641350210970466, + "grad_norm": 1.244438648223877, + "learning_rate": 8.103078278450075e-05, + "loss": 0.5702623128890991, + "step": 4892 + }, + { + "epoch": 2.0649789029535865, + "grad_norm": 1.1585633754730225, + "learning_rate": 8.101194898388881e-05, + "loss": 0.5392299890518188, + "step": 4894 + }, + { + "epoch": 2.0658227848101265, + "grad_norm": 1.3044285774230957, + "learning_rate": 8.099310802941582e-05, + "loss": 0.5640127658843994, + "step": 4896 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.2483032941818237, + "learning_rate": 8.097425992542804e-05, + "loss": 0.6103175282478333, + "step": 4898 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 1.0845462083816528, + "learning_rate": 8.095540467627337e-05, + "loss": 0.5041166543960571, + "step": 4900 + }, + { + "epoch": 2.067510548523207, + "eval_loss": 0.6941288113594055, + "eval_runtime": 513.4497, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 4900 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 1.2493232488632202, + "learning_rate": 8.093654228630134e-05, + "loss": 0.6253946423530579, + "step": 4902 + }, + { + "epoch": 2.0691983122362867, + "grad_norm": 1.1668756008148193, + "learning_rate": 8.091767275986317e-05, + "loss": 0.523486852645874, + "step": 4904 + }, + { + "epoch": 2.070042194092827, + "grad_norm": 1.1709638833999634, + "learning_rate": 8.089879610131167e-05, + "loss": 0.5569989681243896, + "step": 4906 + }, + { + "epoch": 2.070886075949367, + "grad_norm": 1.1044740676879883, + "learning_rate": 8.087991231500133e-05, + "loss": 0.642728865146637, + "step": 4908 + }, + { + "epoch": 2.071729957805907, + "grad_norm": 1.1032549142837524, + "learning_rate": 8.086102140528828e-05, + "loss": 0.5998259782791138, + "step": 4910 + }, + { + "epoch": 2.0725738396624473, + "grad_norm": 0.9980027079582214, + "learning_rate": 8.08421233765303e-05, + "loss": 0.5460172891616821, + "step": 4912 + }, + { + "epoch": 2.0734177215189873, + "grad_norm": 1.0866090059280396, + "learning_rate": 8.082321823308679e-05, + "loss": 0.5643284916877747, + "step": 4914 + }, + { + "epoch": 2.0742616033755272, + "grad_norm": 1.1942687034606934, + "learning_rate": 8.080430597931878e-05, + "loss": 0.554400622844696, + "step": 4916 + }, + { + "epoch": 2.0751054852320676, + "grad_norm": 1.0680599212646484, + "learning_rate": 8.078538661958901e-05, + "loss": 0.5955621004104614, + "step": 4918 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 1.20845627784729, + "learning_rate": 8.076646015826179e-05, + "loss": 0.5970203280448914, + "step": 4920 + }, + { + "epoch": 2.0767932489451475, + "grad_norm": 1.8368924856185913, + "learning_rate": 8.074752659970308e-05, + "loss": 0.6467664837837219, + "step": 4922 + }, + { + "epoch": 2.077637130801688, + "grad_norm": 1.3291922807693481, + "learning_rate": 8.072858594828053e-05, + "loss": 0.630719006061554, + "step": 4924 + }, + { + "epoch": 2.078481012658228, + "grad_norm": 1.1496083736419678, + "learning_rate": 8.070963820836333e-05, + "loss": 0.601140022277832, + "step": 4926 + }, + { + "epoch": 2.0793248945147678, + "grad_norm": 1.1562724113464355, + "learning_rate": 8.069068338432239e-05, + "loss": 0.6096881031990051, + "step": 4928 + }, + { + "epoch": 2.080168776371308, + "grad_norm": 1.0115300416946411, + "learning_rate": 8.067172148053021e-05, + "loss": 0.5085908770561218, + "step": 4930 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 1.2181830406188965, + "learning_rate": 8.065275250136097e-05, + "loss": 0.5268720984458923, + "step": 4932 + }, + { + "epoch": 2.081856540084388, + "grad_norm": 1.1249788999557495, + "learning_rate": 8.06337764511904e-05, + "loss": 0.6075665950775146, + "step": 4934 + }, + { + "epoch": 2.0827004219409284, + "grad_norm": 1.1143964529037476, + "learning_rate": 8.061479333439595e-05, + "loss": 0.59170001745224, + "step": 4936 + }, + { + "epoch": 2.0835443037974684, + "grad_norm": 1.4773131608963013, + "learning_rate": 8.059580315535664e-05, + "loss": 0.6689745187759399, + "step": 4938 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 1.143965244293213, + "learning_rate": 8.057680591845316e-05, + "loss": 0.5409777760505676, + "step": 4940 + }, + { + "epoch": 2.0852320675105487, + "grad_norm": 1.0384942293167114, + "learning_rate": 8.055780162806777e-05, + "loss": 0.5778636336326599, + "step": 4942 + }, + { + "epoch": 2.0860759493670886, + "grad_norm": 1.0102177858352661, + "learning_rate": 8.053879028858442e-05, + "loss": 0.5576038360595703, + "step": 4944 + }, + { + "epoch": 2.0869198312236286, + "grad_norm": 1.3792158365249634, + "learning_rate": 8.051977190438868e-05, + "loss": 0.5873376131057739, + "step": 4946 + }, + { + "epoch": 2.087763713080169, + "grad_norm": 1.4402949810028076, + "learning_rate": 8.050074647986768e-05, + "loss": 0.6067743301391602, + "step": 4948 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.2719058990478516, + "learning_rate": 8.048171401941027e-05, + "loss": 0.604671835899353, + "step": 4950 + }, + { + "epoch": 2.089451476793249, + "grad_norm": 1.1054867506027222, + "learning_rate": 8.046267452740683e-05, + "loss": 0.5743544697761536, + "step": 4952 + }, + { + "epoch": 2.090295358649789, + "grad_norm": 1.0521535873413086, + "learning_rate": 8.044362800824944e-05, + "loss": 0.576278567314148, + "step": 4954 + }, + { + "epoch": 2.091139240506329, + "grad_norm": 1.2665088176727295, + "learning_rate": 8.042457446633174e-05, + "loss": 0.5903641581535339, + "step": 4956 + }, + { + "epoch": 2.091983122362869, + "grad_norm": 1.1283398866653442, + "learning_rate": 8.040551390604902e-05, + "loss": 0.5854214429855347, + "step": 4958 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 1.1194316148757935, + "learning_rate": 8.03864463317982e-05, + "loss": 0.5843619108200073, + "step": 4960 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 1.3581651449203491, + "learning_rate": 8.036737174797778e-05, + "loss": 0.6115096211433411, + "step": 4962 + }, + { + "epoch": 2.0945147679324894, + "grad_norm": 1.341748595237732, + "learning_rate": 8.034829015898793e-05, + "loss": 0.5998795032501221, + "step": 4964 + }, + { + "epoch": 2.0953586497890297, + "grad_norm": 1.2212611436843872, + "learning_rate": 8.032920156923038e-05, + "loss": 0.628372311592102, + "step": 4966 + }, + { + "epoch": 2.0962025316455697, + "grad_norm": 1.1348317861557007, + "learning_rate": 8.031010598310851e-05, + "loss": 0.5668916702270508, + "step": 4968 + }, + { + "epoch": 2.0970464135021096, + "grad_norm": 1.1106547117233276, + "learning_rate": 8.029100340502731e-05, + "loss": 0.5253881216049194, + "step": 4970 + }, + { + "epoch": 2.09789029535865, + "grad_norm": 1.2471354007720947, + "learning_rate": 8.027189383939339e-05, + "loss": 0.5790762901306152, + "step": 4972 + }, + { + "epoch": 2.09873417721519, + "grad_norm": 1.2477394342422485, + "learning_rate": 8.025277729061492e-05, + "loss": 0.6382888555526733, + "step": 4974 + }, + { + "epoch": 2.09957805907173, + "grad_norm": 1.2716054916381836, + "learning_rate": 8.023365376310176e-05, + "loss": 0.5962072610855103, + "step": 4976 + }, + { + "epoch": 2.10042194092827, + "grad_norm": 1.257820725440979, + "learning_rate": 8.021452326126532e-05, + "loss": 0.5882940292358398, + "step": 4978 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 1.0924186706542969, + "learning_rate": 8.019538578951864e-05, + "loss": 0.5640701055526733, + "step": 4980 + }, + { + "epoch": 2.10210970464135, + "grad_norm": 1.1250383853912354, + "learning_rate": 8.017624135227637e-05, + "loss": 0.5746428966522217, + "step": 4982 + }, + { + "epoch": 2.10295358649789, + "grad_norm": 1.131323218345642, + "learning_rate": 8.015708995395477e-05, + "loss": 0.5611346960067749, + "step": 4984 + }, + { + "epoch": 2.1037974683544305, + "grad_norm": 1.4267152547836304, + "learning_rate": 8.013793159897171e-05, + "loss": 0.6173797249794006, + "step": 4986 + }, + { + "epoch": 2.1046413502109704, + "grad_norm": 1.41414213180542, + "learning_rate": 8.011876629174662e-05, + "loss": 0.64865642786026, + "step": 4988 + }, + { + "epoch": 2.1054852320675104, + "grad_norm": 1.1498184204101562, + "learning_rate": 8.00995940367006e-05, + "loss": 0.6125827431678772, + "step": 4990 + }, + { + "epoch": 2.1063291139240508, + "grad_norm": 1.2327708005905151, + "learning_rate": 8.00804148382563e-05, + "loss": 0.670495867729187, + "step": 4992 + }, + { + "epoch": 2.1071729957805907, + "grad_norm": 1.2797311544418335, + "learning_rate": 8.0061228700838e-05, + "loss": 0.6020209193229675, + "step": 4994 + }, + { + "epoch": 2.1080168776371306, + "grad_norm": 1.079584002494812, + "learning_rate": 8.004203562887157e-05, + "loss": 0.5974310636520386, + "step": 4996 + }, + { + "epoch": 2.108860759493671, + "grad_norm": 1.4352604150772095, + "learning_rate": 8.002283562678452e-05, + "loss": 0.6424587368965149, + "step": 4998 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 1.0876719951629639, + "learning_rate": 8.000362869900586e-05, + "loss": 0.6185846328735352, + "step": 5000 + }, + { + "epoch": 2.109704641350211, + "eval_loss": 0.6908889412879944, + "eval_runtime": 675.8398, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "step": 5000 + }, + { + "epoch": 2.110548523206751, + "grad_norm": 1.0125762224197388, + "learning_rate": 7.998441484996631e-05, + "loss": 0.6127280592918396, + "step": 5002 + }, + { + "epoch": 2.1113924050632913, + "grad_norm": 1.0253753662109375, + "learning_rate": 7.99651940840981e-05, + "loss": 0.5495694875717163, + "step": 5004 + }, + { + "epoch": 2.1122362869198312, + "grad_norm": 1.5620673894882202, + "learning_rate": 7.994596640583511e-05, + "loss": 0.6199497580528259, + "step": 5006 + }, + { + "epoch": 2.113080168776371, + "grad_norm": 1.3032969236373901, + "learning_rate": 7.992673181961281e-05, + "loss": 0.5896390676498413, + "step": 5008 + }, + { + "epoch": 2.1139240506329116, + "grad_norm": 1.0933046340942383, + "learning_rate": 7.990749032986821e-05, + "loss": 0.6332341432571411, + "step": 5010 + }, + { + "epoch": 2.1147679324894515, + "grad_norm": 1.3115314245224, + "learning_rate": 7.988824194104e-05, + "loss": 0.5964323282241821, + "step": 5012 + }, + { + "epoch": 2.1156118143459914, + "grad_norm": 1.229978084564209, + "learning_rate": 7.986898665756837e-05, + "loss": 0.5938325524330139, + "step": 5014 + }, + { + "epoch": 2.116455696202532, + "grad_norm": 1.1779940128326416, + "learning_rate": 7.984972448389517e-05, + "loss": 0.5761791467666626, + "step": 5016 + }, + { + "epoch": 2.1172995780590718, + "grad_norm": 1.063490629196167, + "learning_rate": 7.98304554244638e-05, + "loss": 0.6073653101921082, + "step": 5018 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 1.2390391826629639, + "learning_rate": 7.981117948371927e-05, + "loss": 0.6126761436462402, + "step": 5020 + }, + { + "epoch": 2.118987341772152, + "grad_norm": 1.1946247816085815, + "learning_rate": 7.979189666610818e-05, + "loss": 0.614434003829956, + "step": 5022 + }, + { + "epoch": 2.119831223628692, + "grad_norm": 1.1008374691009521, + "learning_rate": 7.977260697607867e-05, + "loss": 0.5947603583335876, + "step": 5024 + }, + { + "epoch": 2.120675105485232, + "grad_norm": 1.14899480342865, + "learning_rate": 7.975331041808054e-05, + "loss": 0.583965539932251, + "step": 5026 + }, + { + "epoch": 2.1215189873417724, + "grad_norm": 1.1627864837646484, + "learning_rate": 7.973400699656512e-05, + "loss": 0.615121603012085, + "step": 5028 + }, + { + "epoch": 2.1223628691983123, + "grad_norm": 1.3622617721557617, + "learning_rate": 7.971469671598532e-05, + "loss": 0.6268601417541504, + "step": 5030 + }, + { + "epoch": 2.1232067510548522, + "grad_norm": 1.1735879182815552, + "learning_rate": 7.96953795807957e-05, + "loss": 0.6021270155906677, + "step": 5032 + }, + { + "epoch": 2.124050632911392, + "grad_norm": 1.3856201171875, + "learning_rate": 7.96760555954523e-05, + "loss": 0.636816680431366, + "step": 5034 + }, + { + "epoch": 2.1248945147679326, + "grad_norm": 1.1410126686096191, + "learning_rate": 7.965672476441282e-05, + "loss": 0.5324423313140869, + "step": 5036 + }, + { + "epoch": 2.1257383966244725, + "grad_norm": 1.446070909500122, + "learning_rate": 7.963738709213651e-05, + "loss": 0.7433624267578125, + "step": 5038 + }, + { + "epoch": 2.1265822784810124, + "grad_norm": 1.3041753768920898, + "learning_rate": 7.961804258308419e-05, + "loss": 0.6359145641326904, + "step": 5040 + }, + { + "epoch": 2.127426160337553, + "grad_norm": 1.2043813467025757, + "learning_rate": 7.959869124171826e-05, + "loss": 0.6164234280586243, + "step": 5042 + }, + { + "epoch": 2.1282700421940928, + "grad_norm": 1.2375630140304565, + "learning_rate": 7.957933307250273e-05, + "loss": 0.6437279582023621, + "step": 5044 + }, + { + "epoch": 2.1291139240506327, + "grad_norm": 1.210644245147705, + "learning_rate": 7.955996807990314e-05, + "loss": 0.585924506187439, + "step": 5046 + }, + { + "epoch": 2.129957805907173, + "grad_norm": 1.2011489868164062, + "learning_rate": 7.954059626838661e-05, + "loss": 0.6081803441047668, + "step": 5048 + }, + { + "epoch": 2.130801687763713, + "grad_norm": 1.0365782976150513, + "learning_rate": 7.952121764242187e-05, + "loss": 0.5609047412872314, + "step": 5050 + }, + { + "epoch": 2.131645569620253, + "grad_norm": 1.7950767278671265, + "learning_rate": 7.950183220647918e-05, + "loss": 0.5612874031066895, + "step": 5052 + }, + { + "epoch": 2.1324894514767934, + "grad_norm": 1.2933409214019775, + "learning_rate": 7.94824399650304e-05, + "loss": 0.6554630994796753, + "step": 5054 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 1.129828929901123, + "learning_rate": 7.946304092254894e-05, + "loss": 0.5623239278793335, + "step": 5056 + }, + { + "epoch": 2.1341772151898732, + "grad_norm": 1.1060296297073364, + "learning_rate": 7.944363508350978e-05, + "loss": 0.5036910772323608, + "step": 5058 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 1.2622627019882202, + "learning_rate": 7.94242224523895e-05, + "loss": 0.5840913653373718, + "step": 5060 + }, + { + "epoch": 2.1358649789029536, + "grad_norm": 1.3803153038024902, + "learning_rate": 7.940480303366618e-05, + "loss": 0.6365578770637512, + "step": 5062 + }, + { + "epoch": 2.1367088607594935, + "grad_norm": 1.2524651288986206, + "learning_rate": 7.938537683181955e-05, + "loss": 0.6167916655540466, + "step": 5064 + }, + { + "epoch": 2.137552742616034, + "grad_norm": 1.3320350646972656, + "learning_rate": 7.936594385133083e-05, + "loss": 0.6356930732727051, + "step": 5066 + }, + { + "epoch": 2.138396624472574, + "grad_norm": 1.3180949687957764, + "learning_rate": 7.934650409668285e-05, + "loss": 0.5888242721557617, + "step": 5068 + }, + { + "epoch": 2.1392405063291138, + "grad_norm": 1.1376243829727173, + "learning_rate": 7.932705757235999e-05, + "loss": 0.608725905418396, + "step": 5070 + }, + { + "epoch": 2.140084388185654, + "grad_norm": 1.1734369993209839, + "learning_rate": 7.930760428284817e-05, + "loss": 0.5824158787727356, + "step": 5072 + }, + { + "epoch": 2.140928270042194, + "grad_norm": 1.1038579940795898, + "learning_rate": 7.928814423263493e-05, + "loss": 0.5629416704177856, + "step": 5074 + }, + { + "epoch": 2.141772151898734, + "grad_norm": 1.269780158996582, + "learning_rate": 7.926867742620929e-05, + "loss": 0.5994445085525513, + "step": 5076 + }, + { + "epoch": 2.1426160337552744, + "grad_norm": 1.2274279594421387, + "learning_rate": 7.924920386806188e-05, + "loss": 0.5845475792884827, + "step": 5078 + }, + { + "epoch": 2.1434599156118144, + "grad_norm": 1.168766975402832, + "learning_rate": 7.922972356268488e-05, + "loss": 0.621201753616333, + "step": 5080 + }, + { + "epoch": 2.1443037974683543, + "grad_norm": 1.0057638883590698, + "learning_rate": 7.921023651457203e-05, + "loss": 0.5282597541809082, + "step": 5082 + }, + { + "epoch": 2.1451476793248947, + "grad_norm": 1.432309865951538, + "learning_rate": 7.91907427282186e-05, + "loss": 0.632583737373352, + "step": 5084 + }, + { + "epoch": 2.1459915611814346, + "grad_norm": 1.3939776420593262, + "learning_rate": 7.917124220812144e-05, + "loss": 0.6239289045333862, + "step": 5086 + }, + { + "epoch": 2.1468354430379746, + "grad_norm": 1.3741775751113892, + "learning_rate": 7.915173495877895e-05, + "loss": 0.5749062895774841, + "step": 5088 + }, + { + "epoch": 2.147679324894515, + "grad_norm": 1.3123528957366943, + "learning_rate": 7.913222098469109e-05, + "loss": 0.6011738181114197, + "step": 5090 + }, + { + "epoch": 2.148523206751055, + "grad_norm": 1.3473498821258545, + "learning_rate": 7.911270029035932e-05, + "loss": 0.5804699659347534, + "step": 5092 + }, + { + "epoch": 2.149367088607595, + "grad_norm": 1.0873067378997803, + "learning_rate": 7.909317288028673e-05, + "loss": 0.6446103453636169, + "step": 5094 + }, + { + "epoch": 2.1502109704641352, + "grad_norm": 1.1374083757400513, + "learning_rate": 7.907363875897789e-05, + "loss": 0.6136524677276611, + "step": 5096 + }, + { + "epoch": 2.151054852320675, + "grad_norm": 1.1356533765792847, + "learning_rate": 7.905409793093896e-05, + "loss": 0.5107976794242859, + "step": 5098 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 1.2579567432403564, + "learning_rate": 7.903455040067763e-05, + "loss": 0.6073099374771118, + "step": 5100 + }, + { + "epoch": 2.151898734177215, + "eval_loss": 0.6902023553848267, + "eval_runtime": 733.915, + "eval_samples_per_second": 2.871, + "eval_steps_per_second": 2.871, + "step": 5100 + }, + { + "epoch": 2.1527426160337555, + "grad_norm": 1.2401398420333862, + "learning_rate": 7.901499617270315e-05, + "loss": 0.5562406182289124, + "step": 5102 + }, + { + "epoch": 2.1535864978902954, + "grad_norm": 1.086590051651001, + "learning_rate": 7.899543525152628e-05, + "loss": 0.5749467015266418, + "step": 5104 + }, + { + "epoch": 2.1544303797468354, + "grad_norm": 1.206458568572998, + "learning_rate": 7.897586764165939e-05, + "loss": 0.6326877474784851, + "step": 5106 + }, + { + "epoch": 2.1552742616033758, + "grad_norm": 1.030740737915039, + "learning_rate": 7.895629334761632e-05, + "loss": 0.5616445541381836, + "step": 5108 + }, + { + "epoch": 2.1561181434599157, + "grad_norm": 1.3338581323623657, + "learning_rate": 7.89367123739125e-05, + "loss": 0.6307384371757507, + "step": 5110 + }, + { + "epoch": 2.1569620253164556, + "grad_norm": 1.2684671878814697, + "learning_rate": 7.891712472506485e-05, + "loss": 0.6087653636932373, + "step": 5112 + }, + { + "epoch": 2.1578059071729956, + "grad_norm": 1.1610581874847412, + "learning_rate": 7.889753040559188e-05, + "loss": 0.5747998952865601, + "step": 5114 + }, + { + "epoch": 2.158649789029536, + "grad_norm": 1.4069275856018066, + "learning_rate": 7.887792942001366e-05, + "loss": 0.6143770217895508, + "step": 5116 + }, + { + "epoch": 2.159493670886076, + "grad_norm": 1.0858227014541626, + "learning_rate": 7.885832177285173e-05, + "loss": 0.552534282207489, + "step": 5118 + }, + { + "epoch": 2.160337552742616, + "grad_norm": 1.067070722579956, + "learning_rate": 7.88387074686292e-05, + "loss": 0.5781989693641663, + "step": 5120 + }, + { + "epoch": 2.1611814345991562, + "grad_norm": 1.139981746673584, + "learning_rate": 7.881908651187072e-05, + "loss": 0.5521422624588013, + "step": 5122 + }, + { + "epoch": 2.162025316455696, + "grad_norm": 1.0987457036972046, + "learning_rate": 7.879945890710245e-05, + "loss": 0.5755025744438171, + "step": 5124 + }, + { + "epoch": 2.162869198312236, + "grad_norm": 1.1530758142471313, + "learning_rate": 7.877982465885214e-05, + "loss": 0.5783509612083435, + "step": 5126 + }, + { + "epoch": 2.1637130801687765, + "grad_norm": 1.2285696268081665, + "learning_rate": 7.876018377164899e-05, + "loss": 0.5942281484603882, + "step": 5128 + }, + { + "epoch": 2.1645569620253164, + "grad_norm": 1.1283711194992065, + "learning_rate": 7.874053625002378e-05, + "loss": 0.5539707541465759, + "step": 5130 + }, + { + "epoch": 2.1654008438818564, + "grad_norm": 1.3213335275650024, + "learning_rate": 7.872088209850885e-05, + "loss": 0.5955292582511902, + "step": 5132 + }, + { + "epoch": 2.1662447257383968, + "grad_norm": 1.1748592853546143, + "learning_rate": 7.8701221321638e-05, + "loss": 0.5422899723052979, + "step": 5134 + }, + { + "epoch": 2.1670886075949367, + "grad_norm": 1.0752148628234863, + "learning_rate": 7.868155392394662e-05, + "loss": 0.5547205209732056, + "step": 5136 + }, + { + "epoch": 2.1679324894514767, + "grad_norm": 1.1814554929733276, + "learning_rate": 7.86618799099716e-05, + "loss": 0.5938948392868042, + "step": 5138 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 1.3455278873443604, + "learning_rate": 7.864219928425132e-05, + "loss": 0.6468925476074219, + "step": 5140 + }, + { + "epoch": 2.169620253164557, + "grad_norm": 1.2695354223251343, + "learning_rate": 7.862251205132576e-05, + "loss": 0.5704391002655029, + "step": 5142 + }, + { + "epoch": 2.170464135021097, + "grad_norm": 1.1529468297958374, + "learning_rate": 7.860281821573638e-05, + "loss": 0.6057283878326416, + "step": 5144 + }, + { + "epoch": 2.1713080168776373, + "grad_norm": 1.3461004495620728, + "learning_rate": 7.858311778202616e-05, + "loss": 0.6135527491569519, + "step": 5146 + }, + { + "epoch": 2.1721518987341772, + "grad_norm": 1.1258536577224731, + "learning_rate": 7.856341075473962e-05, + "loss": 0.5585638880729675, + "step": 5148 + }, + { + "epoch": 2.172995780590717, + "grad_norm": 1.254898190498352, + "learning_rate": 7.854369713842279e-05, + "loss": 0.5780918002128601, + "step": 5150 + }, + { + "epoch": 2.1738396624472576, + "grad_norm": 1.2730201482772827, + "learning_rate": 7.852397693762321e-05, + "loss": 0.595267117023468, + "step": 5152 + }, + { + "epoch": 2.1746835443037975, + "grad_norm": 1.1875078678131104, + "learning_rate": 7.850425015688999e-05, + "loss": 0.5636162161827087, + "step": 5154 + }, + { + "epoch": 2.1755274261603375, + "grad_norm": 1.0930945873260498, + "learning_rate": 7.848451680077366e-05, + "loss": 0.6362089514732361, + "step": 5156 + }, + { + "epoch": 2.176371308016878, + "grad_norm": 1.2274452447891235, + "learning_rate": 7.846477687382639e-05, + "loss": 0.6268675327301025, + "step": 5158 + }, + { + "epoch": 2.1772151898734178, + "grad_norm": 1.2023133039474487, + "learning_rate": 7.844503038060176e-05, + "loss": 0.6014906167984009, + "step": 5160 + }, + { + "epoch": 2.1780590717299577, + "grad_norm": 1.2616889476776123, + "learning_rate": 7.842527732565491e-05, + "loss": 0.6180019974708557, + "step": 5162 + }, + { + "epoch": 2.1789029535864977, + "grad_norm": 1.1046907901763916, + "learning_rate": 7.84055177135425e-05, + "loss": 0.5400100946426392, + "step": 5164 + }, + { + "epoch": 2.179746835443038, + "grad_norm": 1.1664032936096191, + "learning_rate": 7.83857515488227e-05, + "loss": 0.5713199973106384, + "step": 5166 + }, + { + "epoch": 2.180590717299578, + "grad_norm": 1.2526558637619019, + "learning_rate": 7.836597883605519e-05, + "loss": 0.5741307735443115, + "step": 5168 + }, + { + "epoch": 2.181434599156118, + "grad_norm": 1.0457103252410889, + "learning_rate": 7.834619957980112e-05, + "loss": 0.47188031673431396, + "step": 5170 + }, + { + "epoch": 2.1822784810126583, + "grad_norm": 1.1978110074996948, + "learning_rate": 7.832641378462319e-05, + "loss": 0.6149471998214722, + "step": 5172 + }, + { + "epoch": 2.1831223628691983, + "grad_norm": 1.2231460809707642, + "learning_rate": 7.830662145508567e-05, + "loss": 0.5520018339157104, + "step": 5174 + }, + { + "epoch": 2.183966244725738, + "grad_norm": 1.4367618560791016, + "learning_rate": 7.828682259575417e-05, + "loss": 0.6536548733711243, + "step": 5176 + }, + { + "epoch": 2.1848101265822786, + "grad_norm": 1.0891374349594116, + "learning_rate": 7.826701721119598e-05, + "loss": 0.5324372053146362, + "step": 5178 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 1.118695616722107, + "learning_rate": 7.82472053059798e-05, + "loss": 0.6127952337265015, + "step": 5180 + }, + { + "epoch": 2.1864978902953585, + "grad_norm": 1.1116070747375488, + "learning_rate": 7.822738688467585e-05, + "loss": 0.505962610244751, + "step": 5182 + }, + { + "epoch": 2.187341772151899, + "grad_norm": 1.2140545845031738, + "learning_rate": 7.820756195185586e-05, + "loss": 0.6210073232650757, + "step": 5184 + }, + { + "epoch": 2.188185654008439, + "grad_norm": 1.2135601043701172, + "learning_rate": 7.818773051209307e-05, + "loss": 0.6517674326896667, + "step": 5186 + }, + { + "epoch": 2.1890295358649787, + "grad_norm": 1.3875514268875122, + "learning_rate": 7.816789256996218e-05, + "loss": 0.5577492117881775, + "step": 5188 + }, + { + "epoch": 2.189873417721519, + "grad_norm": 1.181325912475586, + "learning_rate": 7.814804813003949e-05, + "loss": 0.6010199189186096, + "step": 5190 + }, + { + "epoch": 2.190717299578059, + "grad_norm": 1.102044701576233, + "learning_rate": 7.812819719690265e-05, + "loss": 0.5635302662849426, + "step": 5192 + }, + { + "epoch": 2.191561181434599, + "grad_norm": 1.4227958917617798, + "learning_rate": 7.810833977513094e-05, + "loss": 0.5804321765899658, + "step": 5194 + }, + { + "epoch": 2.1924050632911394, + "grad_norm": 1.2573446035385132, + "learning_rate": 7.80884758693051e-05, + "loss": 0.6005555987358093, + "step": 5196 + }, + { + "epoch": 2.1932489451476793, + "grad_norm": 1.3534085750579834, + "learning_rate": 7.80686054840073e-05, + "loss": 0.6263643503189087, + "step": 5198 + }, + { + "epoch": 2.1940928270042193, + "grad_norm": 1.6895852088928223, + "learning_rate": 7.804872862382131e-05, + "loss": 0.6235764622688293, + "step": 5200 + }, + { + "epoch": 2.1940928270042193, + "eval_loss": 0.6915348172187805, + "eval_runtime": 1167.9782, + "eval_samples_per_second": 1.804, + "eval_steps_per_second": 1.804, + "step": 5200 + }, + { + "epoch": 2.1949367088607596, + "grad_norm": 1.138973593711853, + "learning_rate": 7.802884529333227e-05, + "loss": 0.5586035847663879, + "step": 5202 + }, + { + "epoch": 2.1957805907172996, + "grad_norm": 1.3664026260375977, + "learning_rate": 7.800895549712697e-05, + "loss": 0.5768917202949524, + "step": 5204 + }, + { + "epoch": 2.1966244725738395, + "grad_norm": 1.2182449102401733, + "learning_rate": 7.798905923979353e-05, + "loss": 0.6046215891838074, + "step": 5206 + }, + { + "epoch": 2.19746835443038, + "grad_norm": 1.2692211866378784, + "learning_rate": 7.796915652592167e-05, + "loss": 0.5412904024124146, + "step": 5208 + }, + { + "epoch": 2.19831223628692, + "grad_norm": 1.200822114944458, + "learning_rate": 7.794924736010256e-05, + "loss": 0.5328584909439087, + "step": 5210 + }, + { + "epoch": 2.19915611814346, + "grad_norm": 1.1093779802322388, + "learning_rate": 7.792933174692886e-05, + "loss": 0.5497913360595703, + "step": 5212 + }, + { + "epoch": 2.2, + "grad_norm": 1.3838921785354614, + "learning_rate": 7.790940969099471e-05, + "loss": 0.5908066034317017, + "step": 5214 + }, + { + "epoch": 2.20084388185654, + "grad_norm": 1.1411913633346558, + "learning_rate": 7.788948119689576e-05, + "loss": 0.6117307543754578, + "step": 5216 + }, + { + "epoch": 2.20168776371308, + "grad_norm": 1.5668916702270508, + "learning_rate": 7.786954626922913e-05, + "loss": 0.5788605809211731, + "step": 5218 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 1.195027232170105, + "learning_rate": 7.784960491259344e-05, + "loss": 0.5948591828346252, + "step": 5220 + }, + { + "epoch": 2.2033755274261604, + "grad_norm": 1.2665271759033203, + "learning_rate": 7.782965713158872e-05, + "loss": 0.6321669220924377, + "step": 5222 + }, + { + "epoch": 2.2042194092827003, + "grad_norm": 1.123711109161377, + "learning_rate": 7.78097029308166e-05, + "loss": 0.5853859186172485, + "step": 5224 + }, + { + "epoch": 2.2050632911392407, + "grad_norm": 1.9381071329116821, + "learning_rate": 7.77897423148801e-05, + "loss": 0.6485977172851562, + "step": 5226 + }, + { + "epoch": 2.2059071729957807, + "grad_norm": 1.4062265157699585, + "learning_rate": 7.776977528838376e-05, + "loss": 0.6243517398834229, + "step": 5228 + }, + { + "epoch": 2.2067510548523206, + "grad_norm": 1.2127182483673096, + "learning_rate": 7.774980185593358e-05, + "loss": 0.5770578980445862, + "step": 5230 + }, + { + "epoch": 2.207594936708861, + "grad_norm": 1.250847578048706, + "learning_rate": 7.772982202213709e-05, + "loss": 0.6521194577217102, + "step": 5232 + }, + { + "epoch": 2.208438818565401, + "grad_norm": 1.2568131685256958, + "learning_rate": 7.77098357916032e-05, + "loss": 0.5755271911621094, + "step": 5234 + }, + { + "epoch": 2.209282700421941, + "grad_norm": 1.2422975301742554, + "learning_rate": 7.768984316894236e-05, + "loss": 0.5486469864845276, + "step": 5236 + }, + { + "epoch": 2.2101265822784812, + "grad_norm": 1.1018635034561157, + "learning_rate": 7.766984415876652e-05, + "loss": 0.5512928366661072, + "step": 5238 + }, + { + "epoch": 2.210970464135021, + "grad_norm": 1.2261123657226562, + "learning_rate": 7.764983876568903e-05, + "loss": 0.5753499269485474, + "step": 5240 + }, + { + "epoch": 2.211814345991561, + "grad_norm": 1.2222342491149902, + "learning_rate": 7.762982699432474e-05, + "loss": 0.5404848456382751, + "step": 5242 + }, + { + "epoch": 2.212658227848101, + "grad_norm": 1.231494426727295, + "learning_rate": 7.760980884929004e-05, + "loss": 0.5999218821525574, + "step": 5244 + }, + { + "epoch": 2.2135021097046415, + "grad_norm": 1.1530078649520874, + "learning_rate": 7.758978433520268e-05, + "loss": 0.6123101115226746, + "step": 5246 + }, + { + "epoch": 2.2143459915611814, + "grad_norm": 1.182706594467163, + "learning_rate": 7.756975345668194e-05, + "loss": 0.5945886969566345, + "step": 5248 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 1.0788652896881104, + "learning_rate": 7.754971621834857e-05, + "loss": 0.5698213577270508, + "step": 5250 + }, + { + "epoch": 2.2160337552742617, + "grad_norm": 1.2243359088897705, + "learning_rate": 7.752967262482477e-05, + "loss": 0.5959678888320923, + "step": 5252 + }, + { + "epoch": 2.2168776371308017, + "grad_norm": 1.4292869567871094, + "learning_rate": 7.750962268073421e-05, + "loss": 0.586794376373291, + "step": 5254 + }, + { + "epoch": 2.2177215189873416, + "grad_norm": 1.1809570789337158, + "learning_rate": 7.748956639070204e-05, + "loss": 0.5513298511505127, + "step": 5256 + }, + { + "epoch": 2.218565400843882, + "grad_norm": 1.485813856124878, + "learning_rate": 7.746950375935484e-05, + "loss": 0.6402831673622131, + "step": 5258 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 1.0851374864578247, + "learning_rate": 7.744943479132069e-05, + "loss": 0.5729117393493652, + "step": 5260 + }, + { + "epoch": 2.220253164556962, + "grad_norm": 1.4308949708938599, + "learning_rate": 7.742935949122911e-05, + "loss": 0.6239725947380066, + "step": 5262 + }, + { + "epoch": 2.2210970464135023, + "grad_norm": 1.379258155822754, + "learning_rate": 7.740927786371107e-05, + "loss": 0.6260181069374084, + "step": 5264 + }, + { + "epoch": 2.221940928270042, + "grad_norm": 1.1661925315856934, + "learning_rate": 7.738918991339905e-05, + "loss": 0.6074157357215881, + "step": 5266 + }, + { + "epoch": 2.222784810126582, + "grad_norm": 1.168901801109314, + "learning_rate": 7.736909564492694e-05, + "loss": 0.6119515895843506, + "step": 5268 + }, + { + "epoch": 2.2236286919831225, + "grad_norm": 1.1451057195663452, + "learning_rate": 7.734899506293008e-05, + "loss": 0.5505842566490173, + "step": 5270 + }, + { + "epoch": 2.2244725738396625, + "grad_norm": 1.2303991317749023, + "learning_rate": 7.732888817204533e-05, + "loss": 0.6117991805076599, + "step": 5272 + }, + { + "epoch": 2.2253164556962024, + "grad_norm": 1.04572331905365, + "learning_rate": 7.730877497691092e-05, + "loss": 0.5589770078659058, + "step": 5274 + }, + { + "epoch": 2.226160337552743, + "grad_norm": 1.2047234773635864, + "learning_rate": 7.72886554821666e-05, + "loss": 0.6288654208183289, + "step": 5276 + }, + { + "epoch": 2.2270042194092827, + "grad_norm": 1.2036652565002441, + "learning_rate": 7.726852969245355e-05, + "loss": 0.6174501776695251, + "step": 5278 + }, + { + "epoch": 2.2278481012658227, + "grad_norm": 1.1740167140960693, + "learning_rate": 7.72483976124144e-05, + "loss": 0.6027677655220032, + "step": 5280 + }, + { + "epoch": 2.228691983122363, + "grad_norm": 1.0600008964538574, + "learning_rate": 7.722825924669326e-05, + "loss": 0.6016151309013367, + "step": 5282 + }, + { + "epoch": 2.229535864978903, + "grad_norm": 1.2631008625030518, + "learning_rate": 7.720811459993562e-05, + "loss": 0.5905849933624268, + "step": 5284 + }, + { + "epoch": 2.230379746835443, + "grad_norm": 1.1024738550186157, + "learning_rate": 7.718796367678848e-05, + "loss": 0.5129587054252625, + "step": 5286 + }, + { + "epoch": 2.2312236286919833, + "grad_norm": 1.23116934299469, + "learning_rate": 7.716780648190028e-05, + "loss": 0.5709586143493652, + "step": 5288 + }, + { + "epoch": 2.2320675105485233, + "grad_norm": 1.2739102840423584, + "learning_rate": 7.714764301992088e-05, + "loss": 0.5454761385917664, + "step": 5290 + }, + { + "epoch": 2.232911392405063, + "grad_norm": 1.303963303565979, + "learning_rate": 7.712747329550162e-05, + "loss": 0.537248969078064, + "step": 5292 + }, + { + "epoch": 2.233755274261603, + "grad_norm": 1.2454309463500977, + "learning_rate": 7.710729731329529e-05, + "loss": 0.6364415884017944, + "step": 5294 + }, + { + "epoch": 2.2345991561181435, + "grad_norm": 1.2401882410049438, + "learning_rate": 7.708711507795605e-05, + "loss": 0.5640100240707397, + "step": 5296 + }, + { + "epoch": 2.2354430379746835, + "grad_norm": 1.197432041168213, + "learning_rate": 7.706692659413959e-05, + "loss": 0.5919729471206665, + "step": 5298 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 1.1779764890670776, + "learning_rate": 7.704673186650298e-05, + "loss": 0.5569849014282227, + "step": 5300 + }, + { + "epoch": 2.2362869198312234, + "eval_loss": 0.6898328065872192, + "eval_runtime": 739.3794, + "eval_samples_per_second": 2.85, + "eval_steps_per_second": 2.85, + "step": 5300 + }, + { + "epoch": 2.237130801687764, + "grad_norm": 1.1371463537216187, + "learning_rate": 7.702653089970479e-05, + "loss": 0.5823061466217041, + "step": 5302 + }, + { + "epoch": 2.2379746835443037, + "grad_norm": 1.1877846717834473, + "learning_rate": 7.700632369840497e-05, + "loss": 0.5556252002716064, + "step": 5304 + }, + { + "epoch": 2.2388185654008437, + "grad_norm": 1.1580896377563477, + "learning_rate": 7.698611026726492e-05, + "loss": 0.5794119834899902, + "step": 5306 + }, + { + "epoch": 2.239662447257384, + "grad_norm": 1.29141366481781, + "learning_rate": 7.696589061094755e-05, + "loss": 0.5828680396080017, + "step": 5308 + }, + { + "epoch": 2.240506329113924, + "grad_norm": 1.1286728382110596, + "learning_rate": 7.694566473411706e-05, + "loss": 0.6161736845970154, + "step": 5310 + }, + { + "epoch": 2.241350210970464, + "grad_norm": 1.0969985723495483, + "learning_rate": 7.692543264143925e-05, + "loss": 0.570767879486084, + "step": 5312 + }, + { + "epoch": 2.2421940928270043, + "grad_norm": 1.2902227640151978, + "learning_rate": 7.690519433758123e-05, + "loss": 0.631476104259491, + "step": 5314 + }, + { + "epoch": 2.2430379746835443, + "grad_norm": 1.432735800743103, + "learning_rate": 7.68849498272116e-05, + "loss": 0.6142309904098511, + "step": 5316 + }, + { + "epoch": 2.243881856540084, + "grad_norm": 1.0824161767959595, + "learning_rate": 7.686469911500038e-05, + "loss": 0.5871514081954956, + "step": 5318 + }, + { + "epoch": 2.2447257383966246, + "grad_norm": 1.1694978475570679, + "learning_rate": 7.684444220561902e-05, + "loss": 0.6144557595252991, + "step": 5320 + }, + { + "epoch": 2.2455696202531645, + "grad_norm": 1.2981040477752686, + "learning_rate": 7.68241791037404e-05, + "loss": 0.6049425601959229, + "step": 5322 + }, + { + "epoch": 2.2464135021097045, + "grad_norm": 1.132128357887268, + "learning_rate": 7.680390981403885e-05, + "loss": 0.5571867823600769, + "step": 5324 + }, + { + "epoch": 2.247257383966245, + "grad_norm": 1.1760079860687256, + "learning_rate": 7.678363434119005e-05, + "loss": 0.5710517168045044, + "step": 5326 + }, + { + "epoch": 2.248101265822785, + "grad_norm": 1.1918572187423706, + "learning_rate": 7.67633526898712e-05, + "loss": 0.5508866906166077, + "step": 5328 + }, + { + "epoch": 2.2489451476793247, + "grad_norm": 1.1837294101715088, + "learning_rate": 7.674306486476091e-05, + "loss": 0.6242696046829224, + "step": 5330 + }, + { + "epoch": 2.249789029535865, + "grad_norm": 1.384918212890625, + "learning_rate": 7.672277087053914e-05, + "loss": 0.5821678042411804, + "step": 5332 + }, + { + "epoch": 2.250632911392405, + "grad_norm": 1.1248877048492432, + "learning_rate": 7.670247071188738e-05, + "loss": 0.5415928363800049, + "step": 5334 + }, + { + "epoch": 2.251476793248945, + "grad_norm": 1.228140950202942, + "learning_rate": 7.668216439348843e-05, + "loss": 0.5475174188613892, + "step": 5336 + }, + { + "epoch": 2.2523206751054854, + "grad_norm": 1.3816046714782715, + "learning_rate": 7.666185192002662e-05, + "loss": 0.5793306231498718, + "step": 5338 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 1.2446565628051758, + "learning_rate": 7.664153329618759e-05, + "loss": 0.6221131682395935, + "step": 5340 + }, + { + "epoch": 2.2540084388185653, + "grad_norm": 1.1677669286727905, + "learning_rate": 7.662120852665852e-05, + "loss": 0.5403847694396973, + "step": 5342 + }, + { + "epoch": 2.2548523206751057, + "grad_norm": 1.2485873699188232, + "learning_rate": 7.66008776161279e-05, + "loss": 0.620201587677002, + "step": 5344 + }, + { + "epoch": 2.2556962025316456, + "grad_norm": 1.2486802339553833, + "learning_rate": 7.658054056928568e-05, + "loss": 0.5969216227531433, + "step": 5346 + }, + { + "epoch": 2.2565400843881855, + "grad_norm": 1.2621372938156128, + "learning_rate": 7.656019739082326e-05, + "loss": 0.6376339793205261, + "step": 5348 + }, + { + "epoch": 2.257383966244726, + "grad_norm": 1.238633155822754, + "learning_rate": 7.65398480854334e-05, + "loss": 0.6374872326850891, + "step": 5350 + }, + { + "epoch": 2.258227848101266, + "grad_norm": 1.3031803369522095, + "learning_rate": 7.651949265781029e-05, + "loss": 0.6348551511764526, + "step": 5352 + }, + { + "epoch": 2.259071729957806, + "grad_norm": 1.3735158443450928, + "learning_rate": 7.649913111264952e-05, + "loss": 0.6267750859260559, + "step": 5354 + }, + { + "epoch": 2.259915611814346, + "grad_norm": 1.1227772235870361, + "learning_rate": 7.647876345464817e-05, + "loss": 0.623030960559845, + "step": 5356 + }, + { + "epoch": 2.260759493670886, + "grad_norm": 1.4555678367614746, + "learning_rate": 7.645838968850459e-05, + "loss": 0.5810713171958923, + "step": 5358 + }, + { + "epoch": 2.261603375527426, + "grad_norm": 1.227725863456726, + "learning_rate": 7.643800981891867e-05, + "loss": 0.6150093078613281, + "step": 5360 + }, + { + "epoch": 2.2624472573839665, + "grad_norm": 1.0648300647735596, + "learning_rate": 7.641762385059161e-05, + "loss": 0.5350445508956909, + "step": 5362 + }, + { + "epoch": 2.2632911392405064, + "grad_norm": 1.179452896118164, + "learning_rate": 7.639723178822613e-05, + "loss": 0.6253421306610107, + "step": 5364 + }, + { + "epoch": 2.2641350210970463, + "grad_norm": 1.0983240604400635, + "learning_rate": 7.637683363652621e-05, + "loss": 0.5512562990188599, + "step": 5366 + }, + { + "epoch": 2.2649789029535867, + "grad_norm": 1.1825451850891113, + "learning_rate": 7.635642940019736e-05, + "loss": 0.5584151148796082, + "step": 5368 + }, + { + "epoch": 2.2658227848101267, + "grad_norm": 1.1022000312805176, + "learning_rate": 7.633601908394643e-05, + "loss": 0.5881790518760681, + "step": 5370 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.1935697793960571, + "learning_rate": 7.631560269248169e-05, + "loss": 0.6060683131217957, + "step": 5372 + }, + { + "epoch": 2.267510548523207, + "grad_norm": 1.1174103021621704, + "learning_rate": 7.62951802305128e-05, + "loss": 0.5877062678337097, + "step": 5374 + }, + { + "epoch": 2.268354430379747, + "grad_norm": 1.3934977054595947, + "learning_rate": 7.627475170275086e-05, + "loss": 0.5145504474639893, + "step": 5376 + }, + { + "epoch": 2.269198312236287, + "grad_norm": 1.2637842893600464, + "learning_rate": 7.625431711390831e-05, + "loss": 0.6194025874137878, + "step": 5378 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 1.2034388780593872, + "learning_rate": 7.623387646869902e-05, + "loss": 0.6205627918243408, + "step": 5380 + }, + { + "epoch": 2.270886075949367, + "grad_norm": 0.953880250453949, + "learning_rate": 7.621342977183826e-05, + "loss": 0.5609696507453918, + "step": 5382 + }, + { + "epoch": 2.271729957805907, + "grad_norm": 1.2841949462890625, + "learning_rate": 7.619297702804272e-05, + "loss": 0.6044906377792358, + "step": 5384 + }, + { + "epoch": 2.272573839662447, + "grad_norm": 1.146804690361023, + "learning_rate": 7.617251824203037e-05, + "loss": 0.5420435667037964, + "step": 5386 + }, + { + "epoch": 2.2734177215189875, + "grad_norm": 1.2225698232650757, + "learning_rate": 7.615205341852076e-05, + "loss": 0.6230710744857788, + "step": 5388 + }, + { + "epoch": 2.2742616033755274, + "grad_norm": 1.3423371315002441, + "learning_rate": 7.613158256223467e-05, + "loss": 0.6486349701881409, + "step": 5390 + }, + { + "epoch": 2.2751054852320673, + "grad_norm": 1.0840023756027222, + "learning_rate": 7.611110567789435e-05, + "loss": 0.6527825593948364, + "step": 5392 + }, + { + "epoch": 2.2759493670886077, + "grad_norm": 1.342466950416565, + "learning_rate": 7.609062277022341e-05, + "loss": 0.6859483122825623, + "step": 5394 + }, + { + "epoch": 2.2767932489451477, + "grad_norm": 1.0406129360198975, + "learning_rate": 7.607013384394691e-05, + "loss": 0.5536003708839417, + "step": 5396 + }, + { + "epoch": 2.2776371308016876, + "grad_norm": 1.0853544473648071, + "learning_rate": 7.604963890379118e-05, + "loss": 0.5488654971122742, + "step": 5398 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 1.0330145359039307, + "learning_rate": 7.602913795448407e-05, + "loss": 0.6072142720222473, + "step": 5400 + }, + { + "epoch": 2.278481012658228, + "eval_loss": 0.6875645518302917, + "eval_runtime": 861.3558, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 5400 + }, + { + "epoch": 2.279324894514768, + "grad_norm": 1.1858742237091064, + "learning_rate": 7.600863100075472e-05, + "loss": 0.5420109033584595, + "step": 5402 + }, + { + "epoch": 2.280168776371308, + "grad_norm": 1.2126039266586304, + "learning_rate": 7.598811804733373e-05, + "loss": 0.6109243631362915, + "step": 5404 + }, + { + "epoch": 2.2810126582278483, + "grad_norm": 1.1290241479873657, + "learning_rate": 7.5967599098953e-05, + "loss": 0.5889696478843689, + "step": 5406 + }, + { + "epoch": 2.281856540084388, + "grad_norm": 1.320263147354126, + "learning_rate": 7.594707416034586e-05, + "loss": 0.6548630595207214, + "step": 5408 + }, + { + "epoch": 2.282700421940928, + "grad_norm": 1.346169114112854, + "learning_rate": 7.592654323624703e-05, + "loss": 0.6556787490844727, + "step": 5410 + }, + { + "epoch": 2.2835443037974685, + "grad_norm": 1.2104716300964355, + "learning_rate": 7.590600633139265e-05, + "loss": 0.5631673336029053, + "step": 5412 + }, + { + "epoch": 2.2843881856540085, + "grad_norm": 1.3298237323760986, + "learning_rate": 7.58854634505201e-05, + "loss": 0.5931088328361511, + "step": 5414 + }, + { + "epoch": 2.2852320675105484, + "grad_norm": 1.4201204776763916, + "learning_rate": 7.586491459836829e-05, + "loss": 0.6966755986213684, + "step": 5416 + }, + { + "epoch": 2.286075949367089, + "grad_norm": 1.253135323524475, + "learning_rate": 7.584435977967743e-05, + "loss": 0.6172569394111633, + "step": 5418 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 1.133144736289978, + "learning_rate": 7.582379899918911e-05, + "loss": 0.5376655459403992, + "step": 5420 + }, + { + "epoch": 2.2877637130801687, + "grad_norm": 1.1103745698928833, + "learning_rate": 7.580323226164632e-05, + "loss": 0.6138498187065125, + "step": 5422 + }, + { + "epoch": 2.2886075949367086, + "grad_norm": 1.091636300086975, + "learning_rate": 7.57826595717934e-05, + "loss": 0.5049096345901489, + "step": 5424 + }, + { + "epoch": 2.289451476793249, + "grad_norm": 1.2486571073532104, + "learning_rate": 7.57620809343761e-05, + "loss": 0.5666115283966064, + "step": 5426 + }, + { + "epoch": 2.290295358649789, + "grad_norm": 1.510684847831726, + "learning_rate": 7.57414963541415e-05, + "loss": 0.49512919783592224, + "step": 5428 + }, + { + "epoch": 2.291139240506329, + "grad_norm": 1.1142191886901855, + "learning_rate": 7.572090583583805e-05, + "loss": 0.558807373046875, + "step": 5430 + }, + { + "epoch": 2.2919831223628693, + "grad_norm": 1.1162657737731934, + "learning_rate": 7.57003093842156e-05, + "loss": 0.6245265603065491, + "step": 5432 + }, + { + "epoch": 2.292827004219409, + "grad_norm": 1.2784614562988281, + "learning_rate": 7.567970700402537e-05, + "loss": 0.5505527853965759, + "step": 5434 + }, + { + "epoch": 2.293670886075949, + "grad_norm": 1.3142638206481934, + "learning_rate": 7.565909870001992e-05, + "loss": 0.6137702465057373, + "step": 5436 + }, + { + "epoch": 2.2945147679324895, + "grad_norm": 1.072805404663086, + "learning_rate": 7.563848447695318e-05, + "loss": 0.540766716003418, + "step": 5438 + }, + { + "epoch": 2.2953586497890295, + "grad_norm": 1.2861377000808716, + "learning_rate": 7.561786433958048e-05, + "loss": 0.6806555986404419, + "step": 5440 + }, + { + "epoch": 2.2962025316455694, + "grad_norm": 1.3193045854568481, + "learning_rate": 7.559723829265847e-05, + "loss": 0.6191258430480957, + "step": 5442 + }, + { + "epoch": 2.29704641350211, + "grad_norm": 1.1969127655029297, + "learning_rate": 7.55766063409452e-05, + "loss": 0.6067718863487244, + "step": 5444 + }, + { + "epoch": 2.2978902953586497, + "grad_norm": 1.2129666805267334, + "learning_rate": 7.555596848920006e-05, + "loss": 0.5673627257347107, + "step": 5446 + }, + { + "epoch": 2.2987341772151897, + "grad_norm": 1.1639961004257202, + "learning_rate": 7.553532474218379e-05, + "loss": 0.61825031042099, + "step": 5448 + }, + { + "epoch": 2.29957805907173, + "grad_norm": 1.3893283605575562, + "learning_rate": 7.551467510465852e-05, + "loss": 0.6096790432929993, + "step": 5450 + }, + { + "epoch": 2.30042194092827, + "grad_norm": 1.0708417892456055, + "learning_rate": 7.549401958138772e-05, + "loss": 0.6121414303779602, + "step": 5452 + }, + { + "epoch": 2.30126582278481, + "grad_norm": 1.3299298286437988, + "learning_rate": 7.547335817713624e-05, + "loss": 0.6504668593406677, + "step": 5454 + }, + { + "epoch": 2.3021097046413503, + "grad_norm": 1.3594682216644287, + "learning_rate": 7.545269089667022e-05, + "loss": 0.5761144161224365, + "step": 5456 + }, + { + "epoch": 2.3029535864978903, + "grad_norm": 1.1089586019515991, + "learning_rate": 7.543201774475726e-05, + "loss": 0.5457773804664612, + "step": 5458 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 1.3472918272018433, + "learning_rate": 7.541133872616624e-05, + "loss": 0.6014775037765503, + "step": 5460 + }, + { + "epoch": 2.3046413502109706, + "grad_norm": 1.2757689952850342, + "learning_rate": 7.53906538456674e-05, + "loss": 0.6246467232704163, + "step": 5462 + }, + { + "epoch": 2.3054852320675105, + "grad_norm": 1.4598166942596436, + "learning_rate": 7.536996310803236e-05, + "loss": 0.6583935022354126, + "step": 5464 + }, + { + "epoch": 2.3063291139240505, + "grad_norm": 1.2861602306365967, + "learning_rate": 7.534926651803407e-05, + "loss": 0.562523603439331, + "step": 5466 + }, + { + "epoch": 2.307172995780591, + "grad_norm": 1.0953221321105957, + "learning_rate": 7.532856408044684e-05, + "loss": 0.6093505620956421, + "step": 5468 + }, + { + "epoch": 2.308016877637131, + "grad_norm": 1.0982829332351685, + "learning_rate": 7.530785580004631e-05, + "loss": 0.6196447014808655, + "step": 5470 + }, + { + "epoch": 2.3088607594936708, + "grad_norm": 1.2224280834197998, + "learning_rate": 7.52871416816095e-05, + "loss": 0.6360989212989807, + "step": 5472 + }, + { + "epoch": 2.309704641350211, + "grad_norm": 1.244486927986145, + "learning_rate": 7.526642172991476e-05, + "loss": 0.6189543008804321, + "step": 5474 + }, + { + "epoch": 2.310548523206751, + "grad_norm": 1.2408053874969482, + "learning_rate": 7.524569594974178e-05, + "loss": 0.6137582659721375, + "step": 5476 + }, + { + "epoch": 2.311392405063291, + "grad_norm": 1.3323272466659546, + "learning_rate": 7.522496434587157e-05, + "loss": 0.6462169289588928, + "step": 5478 + }, + { + "epoch": 2.3122362869198314, + "grad_norm": 1.1076425313949585, + "learning_rate": 7.520422692308657e-05, + "loss": 0.5495362877845764, + "step": 5480 + }, + { + "epoch": 2.3130801687763713, + "grad_norm": 1.3298509120941162, + "learning_rate": 7.518348368617046e-05, + "loss": 0.5560636520385742, + "step": 5482 + }, + { + "epoch": 2.3139240506329113, + "grad_norm": 1.0740195512771606, + "learning_rate": 7.516273463990832e-05, + "loss": 0.5763371586799622, + "step": 5484 + }, + { + "epoch": 2.3147679324894517, + "grad_norm": 1.0748567581176758, + "learning_rate": 7.514197978908657e-05, + "loss": 0.5111498832702637, + "step": 5486 + }, + { + "epoch": 2.3156118143459916, + "grad_norm": 1.2047218084335327, + "learning_rate": 7.512121913849294e-05, + "loss": 0.6599951982498169, + "step": 5488 + }, + { + "epoch": 2.3164556962025316, + "grad_norm": 1.2956700325012207, + "learning_rate": 7.510045269291651e-05, + "loss": 0.6409770846366882, + "step": 5490 + }, + { + "epoch": 2.317299578059072, + "grad_norm": 1.241860032081604, + "learning_rate": 7.50796804571477e-05, + "loss": 0.5967662334442139, + "step": 5492 + }, + { + "epoch": 2.318143459915612, + "grad_norm": 1.1612682342529297, + "learning_rate": 7.50589024359783e-05, + "loss": 0.5856342315673828, + "step": 5494 + }, + { + "epoch": 2.318987341772152, + "grad_norm": 1.0895500183105469, + "learning_rate": 7.503811863420135e-05, + "loss": 0.5652023553848267, + "step": 5496 + }, + { + "epoch": 2.319831223628692, + "grad_norm": 1.3374481201171875, + "learning_rate": 7.50173290566113e-05, + "loss": 0.6777268648147583, + "step": 5498 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 1.192614197731018, + "learning_rate": 7.499653370800391e-05, + "loss": 0.6052314043045044, + "step": 5500 + }, + { + "epoch": 2.320675105485232, + "eval_loss": 0.6867148876190186, + "eval_runtime": 941.3545, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5500 + }, + { + "epoch": 2.321518987341772, + "grad_norm": 1.1008832454681396, + "learning_rate": 7.497573259317625e-05, + "loss": 0.5208253860473633, + "step": 5502 + }, + { + "epoch": 2.3223628691983125, + "grad_norm": 1.2141541242599487, + "learning_rate": 7.495492571692677e-05, + "loss": 0.6352296471595764, + "step": 5504 + }, + { + "epoch": 2.3232067510548524, + "grad_norm": 1.2588802576065063, + "learning_rate": 7.493411308405517e-05, + "loss": 0.6132256388664246, + "step": 5506 + }, + { + "epoch": 2.3240506329113924, + "grad_norm": 1.348765254020691, + "learning_rate": 7.491329469936258e-05, + "loss": 0.571265697479248, + "step": 5508 + }, + { + "epoch": 2.3248945147679323, + "grad_norm": 1.266377329826355, + "learning_rate": 7.489247056765135e-05, + "loss": 0.5433708429336548, + "step": 5510 + }, + { + "epoch": 2.3257383966244727, + "grad_norm": 1.2920128107070923, + "learning_rate": 7.487164069372523e-05, + "loss": 0.6193158030509949, + "step": 5512 + }, + { + "epoch": 2.3265822784810126, + "grad_norm": 1.068169116973877, + "learning_rate": 7.485080508238928e-05, + "loss": 0.5817977786064148, + "step": 5514 + }, + { + "epoch": 2.3274261603375526, + "grad_norm": 1.2941710948944092, + "learning_rate": 7.482996373844985e-05, + "loss": 0.6558082103729248, + "step": 5516 + }, + { + "epoch": 2.328270042194093, + "grad_norm": 1.2143336534500122, + "learning_rate": 7.480911666671467e-05, + "loss": 0.5569961667060852, + "step": 5518 + }, + { + "epoch": 2.329113924050633, + "grad_norm": 1.3364789485931396, + "learning_rate": 7.478826387199274e-05, + "loss": 0.6497300863265991, + "step": 5520 + }, + { + "epoch": 2.329957805907173, + "grad_norm": 1.057530403137207, + "learning_rate": 7.47674053590944e-05, + "loss": 0.5793087482452393, + "step": 5522 + }, + { + "epoch": 2.330801687763713, + "grad_norm": 1.1543176174163818, + "learning_rate": 7.47465411328313e-05, + "loss": 0.5583140850067139, + "step": 5524 + }, + { + "epoch": 2.331645569620253, + "grad_norm": 1.3409180641174316, + "learning_rate": 7.472567119801645e-05, + "loss": 0.6318784952163696, + "step": 5526 + }, + { + "epoch": 2.332489451476793, + "grad_norm": 1.2899413108825684, + "learning_rate": 7.47047955594641e-05, + "loss": 0.5950855612754822, + "step": 5528 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.329220175743103, + "learning_rate": 7.468391422198989e-05, + "loss": 0.6181023716926575, + "step": 5530 + }, + { + "epoch": 2.3341772151898734, + "grad_norm": 1.202129602432251, + "learning_rate": 7.466302719041073e-05, + "loss": 0.6384578943252563, + "step": 5532 + }, + { + "epoch": 2.3350210970464134, + "grad_norm": 1.1890549659729004, + "learning_rate": 7.464213446954487e-05, + "loss": 0.6059293746948242, + "step": 5534 + }, + { + "epoch": 2.3358649789029537, + "grad_norm": 1.2041429281234741, + "learning_rate": 7.462123606421183e-05, + "loss": 0.6432797908782959, + "step": 5536 + }, + { + "epoch": 2.3367088607594937, + "grad_norm": 1.3827080726623535, + "learning_rate": 7.460033197923249e-05, + "loss": 0.6796717047691345, + "step": 5538 + }, + { + "epoch": 2.3375527426160336, + "grad_norm": 1.2323482036590576, + "learning_rate": 7.457942221942903e-05, + "loss": 0.5772476196289062, + "step": 5540 + }, + { + "epoch": 2.338396624472574, + "grad_norm": 1.2011388540267944, + "learning_rate": 7.455850678962493e-05, + "loss": 0.5964269042015076, + "step": 5542 + }, + { + "epoch": 2.339240506329114, + "grad_norm": 1.1133569478988647, + "learning_rate": 7.453758569464495e-05, + "loss": 0.6416608095169067, + "step": 5544 + }, + { + "epoch": 2.340084388185654, + "grad_norm": 1.1257679462432861, + "learning_rate": 7.451665893931521e-05, + "loss": 0.5668829679489136, + "step": 5546 + }, + { + "epoch": 2.3409282700421943, + "grad_norm": 1.3494724035263062, + "learning_rate": 7.449572652846311e-05, + "loss": 0.6029916405677795, + "step": 5548 + }, + { + "epoch": 2.3417721518987342, + "grad_norm": 1.2199759483337402, + "learning_rate": 7.447478846691735e-05, + "loss": 0.6336984634399414, + "step": 5550 + }, + { + "epoch": 2.342616033755274, + "grad_norm": 1.2806570529937744, + "learning_rate": 7.445384475950792e-05, + "loss": 0.579140305519104, + "step": 5552 + }, + { + "epoch": 2.343459915611814, + "grad_norm": 0.9874221086502075, + "learning_rate": 7.443289541106616e-05, + "loss": 0.6061640381813049, + "step": 5554 + }, + { + "epoch": 2.3443037974683545, + "grad_norm": 1.2271486520767212, + "learning_rate": 7.441194042642467e-05, + "loss": 0.5502339601516724, + "step": 5556 + }, + { + "epoch": 2.3451476793248944, + "grad_norm": 1.2522462606430054, + "learning_rate": 7.439097981041738e-05, + "loss": 0.5774438381195068, + "step": 5558 + }, + { + "epoch": 2.3459915611814344, + "grad_norm": 1.267204761505127, + "learning_rate": 7.437001356787945e-05, + "loss": 0.6091527342796326, + "step": 5560 + }, + { + "epoch": 2.3468354430379748, + "grad_norm": 1.1711935997009277, + "learning_rate": 7.434904170364747e-05, + "loss": 0.5443631410598755, + "step": 5562 + }, + { + "epoch": 2.3476793248945147, + "grad_norm": 1.085097074508667, + "learning_rate": 7.432806422255918e-05, + "loss": 0.5255029201507568, + "step": 5564 + }, + { + "epoch": 2.3485232067510546, + "grad_norm": 1.3244949579238892, + "learning_rate": 7.430708112945369e-05, + "loss": 0.5197238922119141, + "step": 5566 + }, + { + "epoch": 2.349367088607595, + "grad_norm": 1.3646879196166992, + "learning_rate": 7.428609242917141e-05, + "loss": 0.5576170682907104, + "step": 5568 + }, + { + "epoch": 2.350210970464135, + "grad_norm": 1.339190125465393, + "learning_rate": 7.426509812655406e-05, + "loss": 0.6254662275314331, + "step": 5570 + }, + { + "epoch": 2.351054852320675, + "grad_norm": 1.4624155759811401, + "learning_rate": 7.424409822644457e-05, + "loss": 0.6593500375747681, + "step": 5572 + }, + { + "epoch": 2.3518987341772153, + "grad_norm": 1.1931114196777344, + "learning_rate": 7.422309273368722e-05, + "loss": 0.6102238297462463, + "step": 5574 + }, + { + "epoch": 2.3527426160337552, + "grad_norm": 1.789340615272522, + "learning_rate": 7.420208165312762e-05, + "loss": 0.6695854067802429, + "step": 5576 + }, + { + "epoch": 2.353586497890295, + "grad_norm": 1.2364262342453003, + "learning_rate": 7.418106498961258e-05, + "loss": 0.578844428062439, + "step": 5578 + }, + { + "epoch": 2.3544303797468356, + "grad_norm": 1.1568509340286255, + "learning_rate": 7.416004274799027e-05, + "loss": 0.5717503428459167, + "step": 5580 + }, + { + "epoch": 2.3552742616033755, + "grad_norm": 1.1744630336761475, + "learning_rate": 7.413901493311009e-05, + "loss": 0.6170201897621155, + "step": 5582 + }, + { + "epoch": 2.3561181434599154, + "grad_norm": 1.0684332847595215, + "learning_rate": 7.411798154982275e-05, + "loss": 0.6482691764831543, + "step": 5584 + }, + { + "epoch": 2.356962025316456, + "grad_norm": 1.046196460723877, + "learning_rate": 7.409694260298025e-05, + "loss": 0.572839617729187, + "step": 5586 + }, + { + "epoch": 2.3578059071729958, + "grad_norm": 1.0110210180282593, + "learning_rate": 7.407589809743591e-05, + "loss": 0.5645976662635803, + "step": 5588 + }, + { + "epoch": 2.3586497890295357, + "grad_norm": 1.0801016092300415, + "learning_rate": 7.405484803804425e-05, + "loss": 0.5653133392333984, + "step": 5590 + }, + { + "epoch": 2.359493670886076, + "grad_norm": 1.0934380292892456, + "learning_rate": 7.403379242966116e-05, + "loss": 0.5972150564193726, + "step": 5592 + }, + { + "epoch": 2.360337552742616, + "grad_norm": 1.3722410202026367, + "learning_rate": 7.40127312771437e-05, + "loss": 0.5927542448043823, + "step": 5594 + }, + { + "epoch": 2.361181434599156, + "grad_norm": 1.1567236185073853, + "learning_rate": 7.399166458535032e-05, + "loss": 0.547027051448822, + "step": 5596 + }, + { + "epoch": 2.3620253164556964, + "grad_norm": 1.2254211902618408, + "learning_rate": 7.397059235914067e-05, + "loss": 0.5356617569923401, + "step": 5598 + }, + { + "epoch": 2.3628691983122363, + "grad_norm": 1.1529103517532349, + "learning_rate": 7.394951460337575e-05, + "loss": 0.5424175262451172, + "step": 5600 + }, + { + "epoch": 2.3628691983122363, + "eval_loss": 0.6851074695587158, + "eval_runtime": 938.5536, + "eval_samples_per_second": 2.245, + "eval_steps_per_second": 2.245, + "step": 5600 + }, + { + "epoch": 2.3637130801687762, + "grad_norm": 1.2050299644470215, + "learning_rate": 7.392843132291777e-05, + "loss": 0.5834107398986816, + "step": 5602 + }, + { + "epoch": 2.3645569620253166, + "grad_norm": 1.264567494392395, + "learning_rate": 7.390734252263024e-05, + "loss": 0.5445035099983215, + "step": 5604 + }, + { + "epoch": 2.3654008438818566, + "grad_norm": 1.357791781425476, + "learning_rate": 7.388624820737791e-05, + "loss": 0.6207653880119324, + "step": 5606 + }, + { + "epoch": 2.3662447257383965, + "grad_norm": 1.2246928215026855, + "learning_rate": 7.386514838202689e-05, + "loss": 0.6628696322441101, + "step": 5608 + }, + { + "epoch": 2.367088607594937, + "grad_norm": 1.1455399990081787, + "learning_rate": 7.384404305144447e-05, + "loss": 0.5870704054832458, + "step": 5610 + }, + { + "epoch": 2.367932489451477, + "grad_norm": 1.2338638305664062, + "learning_rate": 7.382293222049925e-05, + "loss": 0.6160538792610168, + "step": 5612 + }, + { + "epoch": 2.3687763713080168, + "grad_norm": 1.231271505355835, + "learning_rate": 7.38018158940611e-05, + "loss": 0.6274036765098572, + "step": 5614 + }, + { + "epoch": 2.369620253164557, + "grad_norm": 1.022050380706787, + "learning_rate": 7.378069407700114e-05, + "loss": 0.5623515248298645, + "step": 5616 + }, + { + "epoch": 2.370464135021097, + "grad_norm": 1.2040951251983643, + "learning_rate": 7.375956677419178e-05, + "loss": 0.5505564212799072, + "step": 5618 + }, + { + "epoch": 2.371308016877637, + "grad_norm": 1.1754523515701294, + "learning_rate": 7.373843399050668e-05, + "loss": 0.6537002921104431, + "step": 5620 + }, + { + "epoch": 2.3721518987341774, + "grad_norm": 1.1710485219955444, + "learning_rate": 7.371729573082073e-05, + "loss": 0.6224458813667297, + "step": 5622 + }, + { + "epoch": 2.3729957805907174, + "grad_norm": 1.1629483699798584, + "learning_rate": 7.36961520000102e-05, + "loss": 0.6297177076339722, + "step": 5624 + }, + { + "epoch": 2.3738396624472573, + "grad_norm": 1.1069440841674805, + "learning_rate": 7.367500280295248e-05, + "loss": 0.5202008485794067, + "step": 5626 + }, + { + "epoch": 2.3746835443037977, + "grad_norm": 1.0068297386169434, + "learning_rate": 7.36538481445263e-05, + "loss": 0.5256102681159973, + "step": 5628 + }, + { + "epoch": 2.3755274261603376, + "grad_norm": 1.1103417873382568, + "learning_rate": 7.363268802961161e-05, + "loss": 0.5460903644561768, + "step": 5630 + }, + { + "epoch": 2.3763713080168776, + "grad_norm": 1.2885268926620483, + "learning_rate": 7.361152246308969e-05, + "loss": 0.5817124247550964, + "step": 5632 + }, + { + "epoch": 2.377215189873418, + "grad_norm": 1.233831524848938, + "learning_rate": 7.359035144984302e-05, + "loss": 0.5415143966674805, + "step": 5634 + }, + { + "epoch": 2.378059071729958, + "grad_norm": 1.3451908826828003, + "learning_rate": 7.35691749947553e-05, + "loss": 0.6837685108184814, + "step": 5636 + }, + { + "epoch": 2.378902953586498, + "grad_norm": 1.1320621967315674, + "learning_rate": 7.354799310271159e-05, + "loss": 0.5966196656227112, + "step": 5638 + }, + { + "epoch": 2.379746835443038, + "grad_norm": 1.1884461641311646, + "learning_rate": 7.35268057785981e-05, + "loss": 0.5607479214668274, + "step": 5640 + }, + { + "epoch": 2.380590717299578, + "grad_norm": 1.2710856199264526, + "learning_rate": 7.350561302730236e-05, + "loss": 0.595242977142334, + "step": 5642 + }, + { + "epoch": 2.381434599156118, + "grad_norm": 1.3110458850860596, + "learning_rate": 7.348441485371314e-05, + "loss": 0.6208752393722534, + "step": 5644 + }, + { + "epoch": 2.382278481012658, + "grad_norm": 1.1734380722045898, + "learning_rate": 7.346321126272044e-05, + "loss": 0.6173125505447388, + "step": 5646 + }, + { + "epoch": 2.3831223628691984, + "grad_norm": 1.2024762630462646, + "learning_rate": 7.34420022592155e-05, + "loss": 0.6013050675392151, + "step": 5648 + }, + { + "epoch": 2.3839662447257384, + "grad_norm": 1.1305288076400757, + "learning_rate": 7.342078784809086e-05, + "loss": 0.5919594764709473, + "step": 5650 + }, + { + "epoch": 2.3848101265822783, + "grad_norm": 1.075323462486267, + "learning_rate": 7.339956803424028e-05, + "loss": 0.5399283766746521, + "step": 5652 + }, + { + "epoch": 2.3856540084388187, + "grad_norm": 1.2035599946975708, + "learning_rate": 7.337834282255873e-05, + "loss": 0.6253576874732971, + "step": 5654 + }, + { + "epoch": 2.3864978902953586, + "grad_norm": 1.0572105646133423, + "learning_rate": 7.335711221794251e-05, + "loss": 0.5247007608413696, + "step": 5656 + }, + { + "epoch": 2.3873417721518986, + "grad_norm": 1.2701191902160645, + "learning_rate": 7.333587622528906e-05, + "loss": 0.5800243020057678, + "step": 5658 + }, + { + "epoch": 2.388185654008439, + "grad_norm": 1.1772741079330444, + "learning_rate": 7.331463484949716e-05, + "loss": 0.589645504951477, + "step": 5660 + }, + { + "epoch": 2.389029535864979, + "grad_norm": 1.0562703609466553, + "learning_rate": 7.329338809546674e-05, + "loss": 0.5820419192314148, + "step": 5662 + }, + { + "epoch": 2.389873417721519, + "grad_norm": 1.1634355783462524, + "learning_rate": 7.327213596809906e-05, + "loss": 0.591435432434082, + "step": 5664 + }, + { + "epoch": 2.3907172995780592, + "grad_norm": 1.2220302820205688, + "learning_rate": 7.325087847229655e-05, + "loss": 0.5630883574485779, + "step": 5666 + }, + { + "epoch": 2.391561181434599, + "grad_norm": 1.4087659120559692, + "learning_rate": 7.322961561296294e-05, + "loss": 0.6050130128860474, + "step": 5668 + }, + { + "epoch": 2.392405063291139, + "grad_norm": 1.1126172542572021, + "learning_rate": 7.320834739500313e-05, + "loss": 0.56146240234375, + "step": 5670 + }, + { + "epoch": 2.3932489451476795, + "grad_norm": 0.99373859167099, + "learning_rate": 7.31870738233233e-05, + "loss": 0.5507852435112, + "step": 5672 + }, + { + "epoch": 2.3940928270042194, + "grad_norm": 1.14408540725708, + "learning_rate": 7.316579490283085e-05, + "loss": 0.5895347595214844, + "step": 5674 + }, + { + "epoch": 2.3949367088607594, + "grad_norm": 1.1728581190109253, + "learning_rate": 7.314451063843443e-05, + "loss": 0.5304404497146606, + "step": 5676 + }, + { + "epoch": 2.3957805907172998, + "grad_norm": 1.1721378564834595, + "learning_rate": 7.31232210350439e-05, + "loss": 0.5805793404579163, + "step": 5678 + }, + { + "epoch": 2.3966244725738397, + "grad_norm": 1.0499866008758545, + "learning_rate": 7.310192609757038e-05, + "loss": 0.5671767592430115, + "step": 5680 + }, + { + "epoch": 2.3974683544303796, + "grad_norm": 1.0959177017211914, + "learning_rate": 7.308062583092617e-05, + "loss": 0.6335723400115967, + "step": 5682 + }, + { + "epoch": 2.3983122362869196, + "grad_norm": 1.31142258644104, + "learning_rate": 7.305932024002487e-05, + "loss": 0.6032374501228333, + "step": 5684 + }, + { + "epoch": 2.39915611814346, + "grad_norm": 0.9212818741798401, + "learning_rate": 7.303800932978124e-05, + "loss": 0.5492936372756958, + "step": 5686 + }, + { + "epoch": 2.4, + "grad_norm": 1.1956428289413452, + "learning_rate": 7.301669310511132e-05, + "loss": 0.5533297061920166, + "step": 5688 + }, + { + "epoch": 2.40084388185654, + "grad_norm": 1.4048634767532349, + "learning_rate": 7.299537157093232e-05, + "loss": 0.5859368443489075, + "step": 5690 + }, + { + "epoch": 2.4016877637130802, + "grad_norm": 1.0580679178237915, + "learning_rate": 7.297404473216277e-05, + "loss": 0.5099439024925232, + "step": 5692 + }, + { + "epoch": 2.40253164556962, + "grad_norm": 1.2450575828552246, + "learning_rate": 7.29527125937223e-05, + "loss": 0.5631486177444458, + "step": 5694 + }, + { + "epoch": 2.40337552742616, + "grad_norm": 1.338466763496399, + "learning_rate": 7.293137516053187e-05, + "loss": 0.6045404672622681, + "step": 5696 + }, + { + "epoch": 2.4042194092827005, + "grad_norm": 1.198588252067566, + "learning_rate": 7.291003243751358e-05, + "loss": 0.6063475608825684, + "step": 5698 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 1.2315080165863037, + "learning_rate": 7.288868442959081e-05, + "loss": 0.5734809041023254, + "step": 5700 + }, + { + "epoch": 2.4050632911392404, + "eval_loss": 0.6841402053833008, + "eval_runtime": 941.6641, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5700 + }, + { + "epoch": 2.4059071729957804, + "grad_norm": 1.1494885683059692, + "learning_rate": 7.286733114168812e-05, + "loss": 0.5744594931602478, + "step": 5702 + }, + { + "epoch": 2.4067510548523208, + "grad_norm": 1.3769505023956299, + "learning_rate": 7.284597257873132e-05, + "loss": 0.611789882183075, + "step": 5704 + }, + { + "epoch": 2.4075949367088607, + "grad_norm": 1.2326449155807495, + "learning_rate": 7.28246087456474e-05, + "loss": 0.6091431975364685, + "step": 5706 + }, + { + "epoch": 2.4084388185654007, + "grad_norm": 1.1960830688476562, + "learning_rate": 7.28032396473646e-05, + "loss": 0.49431973695755005, + "step": 5708 + }, + { + "epoch": 2.409282700421941, + "grad_norm": 1.1672827005386353, + "learning_rate": 7.278186528881237e-05, + "loss": 0.5344718098640442, + "step": 5710 + }, + { + "epoch": 2.410126582278481, + "grad_norm": 1.1923719644546509, + "learning_rate": 7.276048567492136e-05, + "loss": 0.6011165380477905, + "step": 5712 + }, + { + "epoch": 2.410970464135021, + "grad_norm": 1.2314990758895874, + "learning_rate": 7.273910081062341e-05, + "loss": 0.6300925016403198, + "step": 5714 + }, + { + "epoch": 2.4118143459915613, + "grad_norm": 0.8976680040359497, + "learning_rate": 7.27177107008516e-05, + "loss": 0.56329345703125, + "step": 5716 + }, + { + "epoch": 2.4126582278481012, + "grad_norm": 1.2954038381576538, + "learning_rate": 7.269631535054026e-05, + "loss": 0.6266427040100098, + "step": 5718 + }, + { + "epoch": 2.413502109704641, + "grad_norm": 1.3357585668563843, + "learning_rate": 7.267491476462485e-05, + "loss": 0.6234018802642822, + "step": 5720 + }, + { + "epoch": 2.4143459915611816, + "grad_norm": 1.1913645267486572, + "learning_rate": 7.265350894804209e-05, + "loss": 0.5909059047698975, + "step": 5722 + }, + { + "epoch": 2.4151898734177215, + "grad_norm": 1.3425955772399902, + "learning_rate": 7.263209790572986e-05, + "loss": 0.5708479285240173, + "step": 5724 + }, + { + "epoch": 2.4160337552742615, + "grad_norm": 1.2258507013320923, + "learning_rate": 7.261068164262734e-05, + "loss": 0.5810034871101379, + "step": 5726 + }, + { + "epoch": 2.416877637130802, + "grad_norm": 1.348794937133789, + "learning_rate": 7.258926016367479e-05, + "loss": 0.5939235687255859, + "step": 5728 + }, + { + "epoch": 2.4177215189873418, + "grad_norm": 1.0896574258804321, + "learning_rate": 7.256783347381375e-05, + "loss": 0.6298259496688843, + "step": 5730 + }, + { + "epoch": 2.4185654008438817, + "grad_norm": 1.164866328239441, + "learning_rate": 7.254640157798696e-05, + "loss": 0.5277430415153503, + "step": 5732 + }, + { + "epoch": 2.419409282700422, + "grad_norm": 1.1215453147888184, + "learning_rate": 7.252496448113833e-05, + "loss": 0.5724055767059326, + "step": 5734 + }, + { + "epoch": 2.420253164556962, + "grad_norm": 1.0640764236450195, + "learning_rate": 7.2503522188213e-05, + "loss": 0.5439977645874023, + "step": 5736 + }, + { + "epoch": 2.421097046413502, + "grad_norm": 1.4874604940414429, + "learning_rate": 7.248207470415729e-05, + "loss": 0.7568614482879639, + "step": 5738 + }, + { + "epoch": 2.4219409282700424, + "grad_norm": 1.2611099481582642, + "learning_rate": 7.246062203391873e-05, + "loss": 0.6389632225036621, + "step": 5740 + }, + { + "epoch": 2.4227848101265823, + "grad_norm": 1.185644507408142, + "learning_rate": 7.243916418244602e-05, + "loss": 0.6180628538131714, + "step": 5742 + }, + { + "epoch": 2.4236286919831223, + "grad_norm": 1.1648430824279785, + "learning_rate": 7.241770115468909e-05, + "loss": 0.619799017906189, + "step": 5744 + }, + { + "epoch": 2.4244725738396626, + "grad_norm": 1.1974445581436157, + "learning_rate": 7.239623295559903e-05, + "loss": 0.6446201205253601, + "step": 5746 + }, + { + "epoch": 2.4253164556962026, + "grad_norm": 1.140477180480957, + "learning_rate": 7.237475959012818e-05, + "loss": 0.5839580297470093, + "step": 5748 + }, + { + "epoch": 2.4261603375527425, + "grad_norm": 1.1374423503875732, + "learning_rate": 7.235328106322998e-05, + "loss": 0.48815420269966125, + "step": 5750 + }, + { + "epoch": 2.427004219409283, + "grad_norm": 1.411432147026062, + "learning_rate": 7.233179737985916e-05, + "loss": 0.638519287109375, + "step": 5752 + }, + { + "epoch": 2.427848101265823, + "grad_norm": 1.1232497692108154, + "learning_rate": 7.231030854497157e-05, + "loss": 0.5776677131652832, + "step": 5754 + }, + { + "epoch": 2.428691983122363, + "grad_norm": 1.0815738439559937, + "learning_rate": 7.228881456352428e-05, + "loss": 0.5297027230262756, + "step": 5756 + }, + { + "epoch": 2.429535864978903, + "grad_norm": 1.2230733633041382, + "learning_rate": 7.226731544047553e-05, + "loss": 0.5630011558532715, + "step": 5758 + }, + { + "epoch": 2.430379746835443, + "grad_norm": 1.2033147811889648, + "learning_rate": 7.224581118078476e-05, + "loss": 0.5772101283073425, + "step": 5760 + }, + { + "epoch": 2.431223628691983, + "grad_norm": 1.2150053977966309, + "learning_rate": 7.22243017894126e-05, + "loss": 0.5412847399711609, + "step": 5762 + }, + { + "epoch": 2.4320675105485234, + "grad_norm": 1.0494824647903442, + "learning_rate": 7.220278727132083e-05, + "loss": 0.5568405389785767, + "step": 5764 + }, + { + "epoch": 2.4329113924050634, + "grad_norm": 1.2803306579589844, + "learning_rate": 7.218126763147244e-05, + "loss": 0.6022217869758606, + "step": 5766 + }, + { + "epoch": 2.4337552742616033, + "grad_norm": 1.0832798480987549, + "learning_rate": 7.215974287483163e-05, + "loss": 0.5568796396255493, + "step": 5768 + }, + { + "epoch": 2.4345991561181437, + "grad_norm": 1.1829264163970947, + "learning_rate": 7.213821300636372e-05, + "loss": 0.5607990026473999, + "step": 5770 + }, + { + "epoch": 2.4354430379746836, + "grad_norm": 2.3017473220825195, + "learning_rate": 7.211667803103523e-05, + "loss": 0.6382274031639099, + "step": 5772 + }, + { + "epoch": 2.4362869198312236, + "grad_norm": 1.1701387166976929, + "learning_rate": 7.209513795381388e-05, + "loss": 0.5748776793479919, + "step": 5774 + }, + { + "epoch": 2.4371308016877635, + "grad_norm": 1.0480856895446777, + "learning_rate": 7.207359277966856e-05, + "loss": 0.5760934352874756, + "step": 5776 + }, + { + "epoch": 2.437974683544304, + "grad_norm": 1.2263693809509277, + "learning_rate": 7.20520425135693e-05, + "loss": 0.6387208104133606, + "step": 5778 + }, + { + "epoch": 2.438818565400844, + "grad_norm": 1.219246506690979, + "learning_rate": 7.203048716048737e-05, + "loss": 0.6078037619590759, + "step": 5780 + }, + { + "epoch": 2.439662447257384, + "grad_norm": 1.2452640533447266, + "learning_rate": 7.200892672539515e-05, + "loss": 0.606924831867218, + "step": 5782 + }, + { + "epoch": 2.440506329113924, + "grad_norm": 1.3469732999801636, + "learning_rate": 7.198736121326621e-05, + "loss": 0.585297703742981, + "step": 5784 + }, + { + "epoch": 2.441350210970464, + "grad_norm": 1.151127576828003, + "learning_rate": 7.196579062907533e-05, + "loss": 0.5849902033805847, + "step": 5786 + }, + { + "epoch": 2.442194092827004, + "grad_norm": 1.0669564008712769, + "learning_rate": 7.19442149777984e-05, + "loss": 0.6150397062301636, + "step": 5788 + }, + { + "epoch": 2.4430379746835444, + "grad_norm": 1.1700209379196167, + "learning_rate": 7.192263426441252e-05, + "loss": 0.6324567794799805, + "step": 5790 + }, + { + "epoch": 2.4438818565400844, + "grad_norm": 1.2832094430923462, + "learning_rate": 7.190104849389597e-05, + "loss": 0.6202381253242493, + "step": 5792 + }, + { + "epoch": 2.4447257383966243, + "grad_norm": 1.2046177387237549, + "learning_rate": 7.187945767122813e-05, + "loss": 0.6156684756278992, + "step": 5794 + }, + { + "epoch": 2.4455696202531647, + "grad_norm": 1.031133770942688, + "learning_rate": 7.185786180138961e-05, + "loss": 0.5763497352600098, + "step": 5796 + }, + { + "epoch": 2.4464135021097047, + "grad_norm": 1.2803475856781006, + "learning_rate": 7.183626088936216e-05, + "loss": 0.5419677495956421, + "step": 5798 + }, + { + "epoch": 2.4472573839662446, + "grad_norm": 1.2407588958740234, + "learning_rate": 7.181465494012869e-05, + "loss": 0.629108190536499, + "step": 5800 + }, + { + "epoch": 2.4472573839662446, + "eval_loss": 0.6835155487060547, + "eval_runtime": 758.407, + "eval_samples_per_second": 2.778, + "eval_steps_per_second": 2.778, + "step": 5800 + }, + { + "epoch": 2.448101265822785, + "grad_norm": 1.3525878190994263, + "learning_rate": 7.17930439586733e-05, + "loss": 0.6146516799926758, + "step": 5802 + }, + { + "epoch": 2.448945147679325, + "grad_norm": 1.255921721458435, + "learning_rate": 7.177142794998121e-05, + "loss": 0.5796315670013428, + "step": 5804 + }, + { + "epoch": 2.449789029535865, + "grad_norm": 1.2135448455810547, + "learning_rate": 7.174980691903881e-05, + "loss": 0.5978766679763794, + "step": 5806 + }, + { + "epoch": 2.4506329113924052, + "grad_norm": 1.117942214012146, + "learning_rate": 7.172818087083367e-05, + "loss": 0.5941054821014404, + "step": 5808 + }, + { + "epoch": 2.451476793248945, + "grad_norm": 1.2917672395706177, + "learning_rate": 7.17065498103545e-05, + "loss": 0.6213865876197815, + "step": 5810 + }, + { + "epoch": 2.452320675105485, + "grad_norm": 1.2287952899932861, + "learning_rate": 7.168491374259118e-05, + "loss": 0.627090573310852, + "step": 5812 + }, + { + "epoch": 2.453164556962025, + "grad_norm": 1.2427480220794678, + "learning_rate": 7.16632726725347e-05, + "loss": 0.605871319770813, + "step": 5814 + }, + { + "epoch": 2.4540084388185655, + "grad_norm": 1.2568929195404053, + "learning_rate": 7.16416266051773e-05, + "loss": 0.5961518883705139, + "step": 5816 + }, + { + "epoch": 2.4548523206751054, + "grad_norm": 1.2202998399734497, + "learning_rate": 7.161997554551226e-05, + "loss": 0.585054874420166, + "step": 5818 + }, + { + "epoch": 2.4556962025316453, + "grad_norm": 1.2326043844223022, + "learning_rate": 7.159831949853409e-05, + "loss": 0.6219096779823303, + "step": 5820 + }, + { + "epoch": 2.4565400843881857, + "grad_norm": 1.2161623239517212, + "learning_rate": 7.15766584692384e-05, + "loss": 0.641189455986023, + "step": 5822 + }, + { + "epoch": 2.4573839662447257, + "grad_norm": 1.2391023635864258, + "learning_rate": 7.1554992462622e-05, + "loss": 0.577190101146698, + "step": 5824 + }, + { + "epoch": 2.4582278481012656, + "grad_norm": 1.0883333683013916, + "learning_rate": 7.153332148368281e-05, + "loss": 0.5264694690704346, + "step": 5826 + }, + { + "epoch": 2.459071729957806, + "grad_norm": 1.2129524946212769, + "learning_rate": 7.15116455374199e-05, + "loss": 0.631437361240387, + "step": 5828 + }, + { + "epoch": 2.459915611814346, + "grad_norm": 1.0476374626159668, + "learning_rate": 7.148996462883352e-05, + "loss": 0.5025489926338196, + "step": 5830 + }, + { + "epoch": 2.460759493670886, + "grad_norm": 1.1389570236206055, + "learning_rate": 7.146827876292502e-05, + "loss": 0.5903586745262146, + "step": 5832 + }, + { + "epoch": 2.4616033755274263, + "grad_norm": 1.4385539293289185, + "learning_rate": 7.14465879446969e-05, + "loss": 0.633786141872406, + "step": 5834 + }, + { + "epoch": 2.462447257383966, + "grad_norm": 1.1184585094451904, + "learning_rate": 7.142489217915283e-05, + "loss": 0.5889136791229248, + "step": 5836 + }, + { + "epoch": 2.463291139240506, + "grad_norm": 1.2257685661315918, + "learning_rate": 7.140319147129763e-05, + "loss": 0.5774597525596619, + "step": 5838 + }, + { + "epoch": 2.4641350210970465, + "grad_norm": 0.9524238109588623, + "learning_rate": 7.13814858261372e-05, + "loss": 0.5220611095428467, + "step": 5840 + }, + { + "epoch": 2.4649789029535865, + "grad_norm": 1.2814422845840454, + "learning_rate": 7.135977524867861e-05, + "loss": 0.5724858641624451, + "step": 5842 + }, + { + "epoch": 2.4658227848101264, + "grad_norm": 1.0978140830993652, + "learning_rate": 7.133805974393013e-05, + "loss": 0.5469759702682495, + "step": 5844 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 1.310279130935669, + "learning_rate": 7.131633931690104e-05, + "loss": 0.6554312705993652, + "step": 5846 + }, + { + "epoch": 2.4675105485232067, + "grad_norm": 1.286189317703247, + "learning_rate": 7.129461397260187e-05, + "loss": 0.6166019439697266, + "step": 5848 + }, + { + "epoch": 2.4683544303797467, + "grad_norm": 1.1586377620697021, + "learning_rate": 7.127288371604424e-05, + "loss": 0.6301121711730957, + "step": 5850 + }, + { + "epoch": 2.469198312236287, + "grad_norm": 1.1684564352035522, + "learning_rate": 7.125114855224087e-05, + "loss": 0.6022663712501526, + "step": 5852 + }, + { + "epoch": 2.470042194092827, + "grad_norm": 1.182511329650879, + "learning_rate": 7.122940848620567e-05, + "loss": 0.5959302186965942, + "step": 5854 + }, + { + "epoch": 2.470886075949367, + "grad_norm": 1.2383002042770386, + "learning_rate": 7.120766352295366e-05, + "loss": 0.6251413822174072, + "step": 5856 + }, + { + "epoch": 2.4717299578059073, + "grad_norm": 1.2001979351043701, + "learning_rate": 7.118591366750097e-05, + "loss": 0.6332544088363647, + "step": 5858 + }, + { + "epoch": 2.4725738396624473, + "grad_norm": 1.2166392803192139, + "learning_rate": 7.116415892486488e-05, + "loss": 0.5797795057296753, + "step": 5860 + }, + { + "epoch": 2.473417721518987, + "grad_norm": 1.2235382795333862, + "learning_rate": 7.114239930006379e-05, + "loss": 0.5335313081741333, + "step": 5862 + }, + { + "epoch": 2.4742616033755276, + "grad_norm": 1.2405973672866821, + "learning_rate": 7.112063479811724e-05, + "loss": 0.5536905527114868, + "step": 5864 + }, + { + "epoch": 2.4751054852320675, + "grad_norm": 1.116328477859497, + "learning_rate": 7.109886542404585e-05, + "loss": 0.554654061794281, + "step": 5866 + }, + { + "epoch": 2.4759493670886075, + "grad_norm": 1.2757837772369385, + "learning_rate": 7.107709118287143e-05, + "loss": 0.6017873287200928, + "step": 5868 + }, + { + "epoch": 2.476793248945148, + "grad_norm": 1.3445937633514404, + "learning_rate": 7.105531207961686e-05, + "loss": 0.6479908227920532, + "step": 5870 + }, + { + "epoch": 2.477637130801688, + "grad_norm": 1.1464542150497437, + "learning_rate": 7.103352811930619e-05, + "loss": 0.5829157829284668, + "step": 5872 + }, + { + "epoch": 2.4784810126582277, + "grad_norm": 1.3275130987167358, + "learning_rate": 7.101173930696453e-05, + "loss": 0.54380863904953, + "step": 5874 + }, + { + "epoch": 2.479324894514768, + "grad_norm": 1.006990909576416, + "learning_rate": 7.098994564761813e-05, + "loss": 0.6313910484313965, + "step": 5876 + }, + { + "epoch": 2.480168776371308, + "grad_norm": 1.1358299255371094, + "learning_rate": 7.09681471462944e-05, + "loss": 0.5343483090400696, + "step": 5878 + }, + { + "epoch": 2.481012658227848, + "grad_norm": 1.1456117630004883, + "learning_rate": 7.094634380802184e-05, + "loss": 0.49450409412384033, + "step": 5880 + }, + { + "epoch": 2.4818565400843884, + "grad_norm": 1.2961846590042114, + "learning_rate": 7.092453563783003e-05, + "loss": 0.6378757357597351, + "step": 5882 + }, + { + "epoch": 2.4827004219409283, + "grad_norm": 0.983889102935791, + "learning_rate": 7.090272264074972e-05, + "loss": 0.5937124490737915, + "step": 5884 + }, + { + "epoch": 2.4835443037974683, + "grad_norm": 1.0205817222595215, + "learning_rate": 7.088090482181273e-05, + "loss": 0.5301283597946167, + "step": 5886 + }, + { + "epoch": 2.4843881856540087, + "grad_norm": 1.1721397638320923, + "learning_rate": 7.085908218605204e-05, + "loss": 0.6191756129264832, + "step": 5888 + }, + { + "epoch": 2.4852320675105486, + "grad_norm": 1.2432814836502075, + "learning_rate": 7.083725473850168e-05, + "loss": 0.5928890109062195, + "step": 5890 + }, + { + "epoch": 2.4860759493670885, + "grad_norm": 1.252125859260559, + "learning_rate": 7.081542248419686e-05, + "loss": 0.6136764287948608, + "step": 5892 + }, + { + "epoch": 2.486919831223629, + "grad_norm": 1.3686699867248535, + "learning_rate": 7.079358542817382e-05, + "loss": 0.6084910035133362, + "step": 5894 + }, + { + "epoch": 2.487763713080169, + "grad_norm": 1.0877282619476318, + "learning_rate": 7.077174357546996e-05, + "loss": 0.5862250924110413, + "step": 5896 + }, + { + "epoch": 2.488607594936709, + "grad_norm": 1.164095401763916, + "learning_rate": 7.074989693112381e-05, + "loss": 0.6300894021987915, + "step": 5898 + }, + { + "epoch": 2.489451476793249, + "grad_norm": 1.1169507503509521, + "learning_rate": 7.072804550017493e-05, + "loss": 0.5508570075035095, + "step": 5900 + }, + { + "epoch": 2.489451476793249, + "eval_loss": 0.6820966005325317, + "eval_runtime": 513.3515, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 5900 + }, + { + "epoch": 2.490295358649789, + "grad_norm": 1.1718615293502808, + "learning_rate": 7.070618928766406e-05, + "loss": 0.550847589969635, + "step": 5902 + }, + { + "epoch": 2.491139240506329, + "grad_norm": 1.4725650548934937, + "learning_rate": 7.068432829863298e-05, + "loss": 0.5663347840309143, + "step": 5904 + }, + { + "epoch": 2.491983122362869, + "grad_norm": 1.042083978652954, + "learning_rate": 7.066246253812462e-05, + "loss": 0.5506191849708557, + "step": 5906 + }, + { + "epoch": 2.4928270042194094, + "grad_norm": 1.2020974159240723, + "learning_rate": 7.064059201118297e-05, + "loss": 0.5656929612159729, + "step": 5908 + }, + { + "epoch": 2.4936708860759493, + "grad_norm": 1.1040663719177246, + "learning_rate": 7.061871672285317e-05, + "loss": 0.5159370303153992, + "step": 5910 + }, + { + "epoch": 2.4945147679324893, + "grad_norm": 1.3681589365005493, + "learning_rate": 7.05968366781814e-05, + "loss": 0.6161949634552002, + "step": 5912 + }, + { + "epoch": 2.4953586497890297, + "grad_norm": 1.26628839969635, + "learning_rate": 7.057495188221498e-05, + "loss": 0.6357758641242981, + "step": 5914 + }, + { + "epoch": 2.4962025316455696, + "grad_norm": 1.2714020013809204, + "learning_rate": 7.05530623400023e-05, + "loss": 0.5467366576194763, + "step": 5916 + }, + { + "epoch": 2.4970464135021095, + "grad_norm": 1.2255018949508667, + "learning_rate": 7.053116805659287e-05, + "loss": 0.592526376247406, + "step": 5918 + }, + { + "epoch": 2.49789029535865, + "grad_norm": 1.2816206216812134, + "learning_rate": 7.050926903703729e-05, + "loss": 0.5819981694221497, + "step": 5920 + }, + { + "epoch": 2.49873417721519, + "grad_norm": 1.1938221454620361, + "learning_rate": 7.048736528638722e-05, + "loss": 0.6037712693214417, + "step": 5922 + }, + { + "epoch": 2.49957805907173, + "grad_norm": 1.1330323219299316, + "learning_rate": 7.046545680969545e-05, + "loss": 0.5567215085029602, + "step": 5924 + }, + { + "epoch": 2.50042194092827, + "grad_norm": 1.233564019203186, + "learning_rate": 7.044354361201585e-05, + "loss": 0.5626974105834961, + "step": 5926 + }, + { + "epoch": 2.50126582278481, + "grad_norm": 1.1913540363311768, + "learning_rate": 7.042162569840336e-05, + "loss": 0.5672739744186401, + "step": 5928 + }, + { + "epoch": 2.50210970464135, + "grad_norm": 1.060952067375183, + "learning_rate": 7.039970307391402e-05, + "loss": 0.5965602993965149, + "step": 5930 + }, + { + "epoch": 2.5029535864978905, + "grad_norm": 1.2003182172775269, + "learning_rate": 7.037777574360497e-05, + "loss": 0.590932309627533, + "step": 5932 + }, + { + "epoch": 2.5037974683544304, + "grad_norm": 1.073434829711914, + "learning_rate": 7.035584371253441e-05, + "loss": 0.5736868381500244, + "step": 5934 + }, + { + "epoch": 2.5046413502109703, + "grad_norm": 1.2641130685806274, + "learning_rate": 7.033390698576166e-05, + "loss": 0.614703357219696, + "step": 5936 + }, + { + "epoch": 2.5054852320675103, + "grad_norm": 1.2406511306762695, + "learning_rate": 7.031196556834708e-05, + "loss": 0.5866397023200989, + "step": 5938 + }, + { + "epoch": 2.5063291139240507, + "grad_norm": 1.231619119644165, + "learning_rate": 7.029001946535215e-05, + "loss": 0.5792667865753174, + "step": 5940 + }, + { + "epoch": 2.5071729957805906, + "grad_norm": 1.419447660446167, + "learning_rate": 7.026806868183939e-05, + "loss": 0.5686604976654053, + "step": 5942 + }, + { + "epoch": 2.5080168776371305, + "grad_norm": 1.139244556427002, + "learning_rate": 7.024611322287245e-05, + "loss": 0.5860661268234253, + "step": 5944 + }, + { + "epoch": 2.508860759493671, + "grad_norm": 1.070517897605896, + "learning_rate": 7.022415309351602e-05, + "loss": 0.5823250412940979, + "step": 5946 + }, + { + "epoch": 2.509704641350211, + "grad_norm": 1.0775398015975952, + "learning_rate": 7.020218829883589e-05, + "loss": 0.5291389226913452, + "step": 5948 + }, + { + "epoch": 2.510548523206751, + "grad_norm": 1.339716911315918, + "learning_rate": 7.018021884389892e-05, + "loss": 0.6215447783470154, + "step": 5950 + }, + { + "epoch": 2.511392405063291, + "grad_norm": 1.3589707612991333, + "learning_rate": 7.0158244733773e-05, + "loss": 0.5419909358024597, + "step": 5952 + }, + { + "epoch": 2.512236286919831, + "grad_norm": 1.1664098501205444, + "learning_rate": 7.01362659735272e-05, + "loss": 0.5476977229118347, + "step": 5954 + }, + { + "epoch": 2.513080168776371, + "grad_norm": 1.1184223890304565, + "learning_rate": 7.011428256823154e-05, + "loss": 0.5896323919296265, + "step": 5956 + }, + { + "epoch": 2.5139240506329115, + "grad_norm": 1.4071170091629028, + "learning_rate": 7.00922945229572e-05, + "loss": 0.6353691220283508, + "step": 5958 + }, + { + "epoch": 2.5147679324894514, + "grad_norm": 1.3740885257720947, + "learning_rate": 7.007030184277641e-05, + "loss": 0.6605582237243652, + "step": 5960 + }, + { + "epoch": 2.5156118143459913, + "grad_norm": 1.071395754814148, + "learning_rate": 7.004830453276241e-05, + "loss": 0.6399887800216675, + "step": 5962 + }, + { + "epoch": 2.5164556962025317, + "grad_norm": 1.2292311191558838, + "learning_rate": 7.002630259798962e-05, + "loss": 0.5992775559425354, + "step": 5964 + }, + { + "epoch": 2.5172995780590717, + "grad_norm": 1.0133391618728638, + "learning_rate": 7.000429604353341e-05, + "loss": 0.5716721415519714, + "step": 5966 + }, + { + "epoch": 2.5181434599156116, + "grad_norm": 1.2669343948364258, + "learning_rate": 6.998228487447032e-05, + "loss": 0.5455520749092102, + "step": 5968 + }, + { + "epoch": 2.518987341772152, + "grad_norm": 1.2026386260986328, + "learning_rate": 6.996026909587785e-05, + "loss": 0.6411572694778442, + "step": 5970 + }, + { + "epoch": 2.519831223628692, + "grad_norm": 1.359923243522644, + "learning_rate": 6.993824871283465e-05, + "loss": 0.6687750220298767, + "step": 5972 + }, + { + "epoch": 2.520675105485232, + "grad_norm": 1.1265650987625122, + "learning_rate": 6.99162237304204e-05, + "loss": 0.6271382570266724, + "step": 5974 + }, + { + "epoch": 2.5215189873417723, + "grad_norm": 1.197667121887207, + "learning_rate": 6.989419415371583e-05, + "loss": 0.6191279888153076, + "step": 5976 + }, + { + "epoch": 2.522362869198312, + "grad_norm": 1.169992446899414, + "learning_rate": 6.987215998780275e-05, + "loss": 0.6313687562942505, + "step": 5978 + }, + { + "epoch": 2.523206751054852, + "grad_norm": 1.2706433534622192, + "learning_rate": 6.9850121237764e-05, + "loss": 0.6058336496353149, + "step": 5980 + }, + { + "epoch": 2.5240506329113925, + "grad_norm": 1.322376012802124, + "learning_rate": 6.982807790868352e-05, + "loss": 0.6466464400291443, + "step": 5982 + }, + { + "epoch": 2.5248945147679325, + "grad_norm": 1.2398571968078613, + "learning_rate": 6.980603000564626e-05, + "loss": 0.5730098485946655, + "step": 5984 + }, + { + "epoch": 2.5257383966244724, + "grad_norm": 1.2035216093063354, + "learning_rate": 6.978397753373826e-05, + "loss": 0.5305635333061218, + "step": 5986 + }, + { + "epoch": 2.526582278481013, + "grad_norm": 1.1951299905776978, + "learning_rate": 6.976192049804661e-05, + "loss": 0.5601096153259277, + "step": 5988 + }, + { + "epoch": 2.5274261603375527, + "grad_norm": 0.9950459599494934, + "learning_rate": 6.973985890365945e-05, + "loss": 0.5049516558647156, + "step": 5990 + }, + { + "epoch": 2.5282700421940927, + "grad_norm": 1.2581008672714233, + "learning_rate": 6.971779275566593e-05, + "loss": 0.5456960797309875, + "step": 5992 + }, + { + "epoch": 2.529113924050633, + "grad_norm": 1.2196903228759766, + "learning_rate": 6.969572205915632e-05, + "loss": 0.6026827096939087, + "step": 5994 + }, + { + "epoch": 2.529957805907173, + "grad_norm": 1.3109357357025146, + "learning_rate": 6.967364681922189e-05, + "loss": 0.597453236579895, + "step": 5996 + }, + { + "epoch": 2.530801687763713, + "grad_norm": 1.016904354095459, + "learning_rate": 6.965156704095498e-05, + "loss": 0.5304323434829712, + "step": 5998 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 1.2363858222961426, + "learning_rate": 6.962948272944896e-05, + "loss": 0.5748253464698792, + "step": 6000 + }, + { + "epoch": 2.5316455696202533, + "eval_loss": 0.6813357472419739, + "eval_runtime": 513.5491, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6000 + }, + { + "epoch": 2.5324894514767933, + "grad_norm": 1.1766576766967773, + "learning_rate": 6.960739388979827e-05, + "loss": 0.613327145576477, + "step": 6002 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 1.4065337181091309, + "learning_rate": 6.95853005270984e-05, + "loss": 0.6648217439651489, + "step": 6004 + }, + { + "epoch": 2.5341772151898736, + "grad_norm": 0.9513862133026123, + "learning_rate": 6.956320264644582e-05, + "loss": 0.5165349841117859, + "step": 6006 + }, + { + "epoch": 2.5350210970464135, + "grad_norm": 1.1104962825775146, + "learning_rate": 6.95411002529381e-05, + "loss": 0.5594159364700317, + "step": 6008 + }, + { + "epoch": 2.5358649789029535, + "grad_norm": 1.1698877811431885, + "learning_rate": 6.951899335167386e-05, + "loss": 0.5662833452224731, + "step": 6010 + }, + { + "epoch": 2.536708860759494, + "grad_norm": 1.2051950693130493, + "learning_rate": 6.949688194775272e-05, + "loss": 0.5780806541442871, + "step": 6012 + }, + { + "epoch": 2.537552742616034, + "grad_norm": 1.2434250116348267, + "learning_rate": 6.947476604627536e-05, + "loss": 0.6112543344497681, + "step": 6014 + }, + { + "epoch": 2.5383966244725737, + "grad_norm": 1.1473076343536377, + "learning_rate": 6.945264565234348e-05, + "loss": 0.5556519031524658, + "step": 6016 + }, + { + "epoch": 2.539240506329114, + "grad_norm": 1.3139631748199463, + "learning_rate": 6.943052077105987e-05, + "loss": 0.6664283275604248, + "step": 6018 + }, + { + "epoch": 2.540084388185654, + "grad_norm": 1.3407402038574219, + "learning_rate": 6.940839140752825e-05, + "loss": 0.6358945369720459, + "step": 6020 + }, + { + "epoch": 2.540928270042194, + "grad_norm": 1.2223491668701172, + "learning_rate": 6.938625756685352e-05, + "loss": 0.6310063600540161, + "step": 6022 + }, + { + "epoch": 2.5417721518987344, + "grad_norm": 1.3984094858169556, + "learning_rate": 6.936411925414146e-05, + "loss": 0.6090726256370544, + "step": 6024 + }, + { + "epoch": 2.5426160337552743, + "grad_norm": 1.1876440048217773, + "learning_rate": 6.9341976474499e-05, + "loss": 0.585586428642273, + "step": 6026 + }, + { + "epoch": 2.5434599156118143, + "grad_norm": 1.2213155031204224, + "learning_rate": 6.931982923303402e-05, + "loss": 0.6382114887237549, + "step": 6028 + }, + { + "epoch": 2.5443037974683547, + "grad_norm": 1.0637959241867065, + "learning_rate": 6.92976775348555e-05, + "loss": 0.5851555466651917, + "step": 6030 + }, + { + "epoch": 2.5451476793248946, + "grad_norm": 1.150227665901184, + "learning_rate": 6.927552138507337e-05, + "loss": 0.5867910385131836, + "step": 6032 + }, + { + "epoch": 2.5459915611814345, + "grad_norm": 1.1405255794525146, + "learning_rate": 6.925336078879865e-05, + "loss": 0.5876969695091248, + "step": 6034 + }, + { + "epoch": 2.546835443037975, + "grad_norm": 1.0269757509231567, + "learning_rate": 6.923119575114339e-05, + "loss": 0.626306414604187, + "step": 6036 + }, + { + "epoch": 2.547679324894515, + "grad_norm": 1.1978809833526611, + "learning_rate": 6.920902627722059e-05, + "loss": 0.645074188709259, + "step": 6038 + }, + { + "epoch": 2.548523206751055, + "grad_norm": 1.1684149503707886, + "learning_rate": 6.918685237214435e-05, + "loss": 0.6284276247024536, + "step": 6040 + }, + { + "epoch": 2.549367088607595, + "grad_norm": 1.2538992166519165, + "learning_rate": 6.916467404102977e-05, + "loss": 0.5770997405052185, + "step": 6042 + }, + { + "epoch": 2.550210970464135, + "grad_norm": 1.2381856441497803, + "learning_rate": 6.914249128899294e-05, + "loss": 0.5501131415367126, + "step": 6044 + }, + { + "epoch": 2.551054852320675, + "grad_norm": 1.0487099885940552, + "learning_rate": 6.912030412115101e-05, + "loss": 0.5362627506256104, + "step": 6046 + }, + { + "epoch": 2.5518987341772155, + "grad_norm": 1.3471804857254028, + "learning_rate": 6.909811254262213e-05, + "loss": 0.6694624423980713, + "step": 6048 + }, + { + "epoch": 2.5527426160337554, + "grad_norm": 1.4262096881866455, + "learning_rate": 6.907591655852547e-05, + "loss": 0.642368733882904, + "step": 6050 + }, + { + "epoch": 2.5535864978902953, + "grad_norm": 1.171004295349121, + "learning_rate": 6.905371617398122e-05, + "loss": 0.6266166567802429, + "step": 6052 + }, + { + "epoch": 2.5544303797468353, + "grad_norm": 1.1249992847442627, + "learning_rate": 6.90315113941106e-05, + "loss": 0.5518985986709595, + "step": 6054 + }, + { + "epoch": 2.5552742616033757, + "grad_norm": 1.3049964904785156, + "learning_rate": 6.900930222403579e-05, + "loss": 0.5367884039878845, + "step": 6056 + }, + { + "epoch": 2.5561181434599156, + "grad_norm": 1.3548237085342407, + "learning_rate": 6.898708866888005e-05, + "loss": 0.6057673096656799, + "step": 6058 + }, + { + "epoch": 2.5569620253164556, + "grad_norm": 1.1422157287597656, + "learning_rate": 6.89648707337676e-05, + "loss": 0.5493726134300232, + "step": 6060 + }, + { + "epoch": 2.557805907172996, + "grad_norm": 1.0179574489593506, + "learning_rate": 6.89426484238237e-05, + "loss": 0.5055251717567444, + "step": 6062 + }, + { + "epoch": 2.558649789029536, + "grad_norm": 1.2062081098556519, + "learning_rate": 6.89204217441746e-05, + "loss": 0.6099714040756226, + "step": 6064 + }, + { + "epoch": 2.559493670886076, + "grad_norm": 1.3043999671936035, + "learning_rate": 6.889819069994759e-05, + "loss": 0.6432347893714905, + "step": 6066 + }, + { + "epoch": 2.5603375527426158, + "grad_norm": 1.241347074508667, + "learning_rate": 6.887595529627093e-05, + "loss": 0.6052974462509155, + "step": 6068 + }, + { + "epoch": 2.561181434599156, + "grad_norm": 1.2502845525741577, + "learning_rate": 6.88537155382739e-05, + "loss": 0.6239711046218872, + "step": 6070 + }, + { + "epoch": 2.562025316455696, + "grad_norm": 1.0815852880477905, + "learning_rate": 6.883147143108679e-05, + "loss": 0.5462124347686768, + "step": 6072 + }, + { + "epoch": 2.562869198312236, + "grad_norm": 1.1990602016448975, + "learning_rate": 6.880922297984087e-05, + "loss": 0.5727240443229675, + "step": 6074 + }, + { + "epoch": 2.5637130801687764, + "grad_norm": 1.016781210899353, + "learning_rate": 6.878697018966846e-05, + "loss": 0.5160089731216431, + "step": 6076 + }, + { + "epoch": 2.5645569620253164, + "grad_norm": 1.1946886777877808, + "learning_rate": 6.876471306570286e-05, + "loss": 0.6344075798988342, + "step": 6078 + }, + { + "epoch": 2.5654008438818563, + "grad_norm": 1.1460139751434326, + "learning_rate": 6.87424516130783e-05, + "loss": 0.6142247319221497, + "step": 6080 + }, + { + "epoch": 2.5662447257383967, + "grad_norm": 1.3636937141418457, + "learning_rate": 6.872018583693013e-05, + "loss": 0.6330769658088684, + "step": 6082 + }, + { + "epoch": 2.5670886075949366, + "grad_norm": 1.3545513153076172, + "learning_rate": 6.869791574239463e-05, + "loss": 0.6386255621910095, + "step": 6084 + }, + { + "epoch": 2.5679324894514766, + "grad_norm": 1.1196715831756592, + "learning_rate": 6.867564133460904e-05, + "loss": 0.5527385473251343, + "step": 6086 + }, + { + "epoch": 2.568776371308017, + "grad_norm": 1.0583977699279785, + "learning_rate": 6.865336261871168e-05, + "loss": 0.5689145922660828, + "step": 6088 + }, + { + "epoch": 2.569620253164557, + "grad_norm": 1.2963348627090454, + "learning_rate": 6.86310795998418e-05, + "loss": 0.5756540298461914, + "step": 6090 + }, + { + "epoch": 2.570464135021097, + "grad_norm": 1.122214436531067, + "learning_rate": 6.860879228313968e-05, + "loss": 0.6062834858894348, + "step": 6092 + }, + { + "epoch": 2.571308016877637, + "grad_norm": 1.1313230991363525, + "learning_rate": 6.858650067374657e-05, + "loss": 0.5526617169380188, + "step": 6094 + }, + { + "epoch": 2.572151898734177, + "grad_norm": 1.6992650032043457, + "learning_rate": 6.856420477680471e-05, + "loss": 0.5911332964897156, + "step": 6096 + }, + { + "epoch": 2.572995780590717, + "grad_norm": 1.2622860670089722, + "learning_rate": 6.854190459745735e-05, + "loss": 0.5730270743370056, + "step": 6098 + }, + { + "epoch": 2.5738396624472575, + "grad_norm": 1.1420512199401855, + "learning_rate": 6.851960014084868e-05, + "loss": 0.597838282585144, + "step": 6100 + }, + { + "epoch": 2.5738396624472575, + "eval_loss": 0.6812278628349304, + "eval_runtime": 513.4749, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6100 + }, + { + "epoch": 2.5746835443037974, + "grad_norm": 1.129335641860962, + "learning_rate": 6.849729141212396e-05, + "loss": 0.6048991084098816, + "step": 6102 + }, + { + "epoch": 2.5755274261603374, + "grad_norm": 1.161284327507019, + "learning_rate": 6.847497841642935e-05, + "loss": 0.6359057426452637, + "step": 6104 + }, + { + "epoch": 2.5763713080168777, + "grad_norm": 1.285344123840332, + "learning_rate": 6.845266115891203e-05, + "loss": 0.5858902335166931, + "step": 6106 + }, + { + "epoch": 2.5772151898734177, + "grad_norm": 1.085143804550171, + "learning_rate": 6.843033964472018e-05, + "loss": 0.5742247700691223, + "step": 6108 + }, + { + "epoch": 2.5780590717299576, + "grad_norm": 1.1920831203460693, + "learning_rate": 6.840801387900291e-05, + "loss": 0.6738532185554504, + "step": 6110 + }, + { + "epoch": 2.578902953586498, + "grad_norm": 1.2750232219696045, + "learning_rate": 6.838568386691042e-05, + "loss": 0.6046389937400818, + "step": 6112 + }, + { + "epoch": 2.579746835443038, + "grad_norm": 1.1027764081954956, + "learning_rate": 6.836334961359373e-05, + "loss": 0.6231611967086792, + "step": 6114 + }, + { + "epoch": 2.580590717299578, + "grad_norm": 1.2996546030044556, + "learning_rate": 6.834101112420497e-05, + "loss": 0.5848191380500793, + "step": 6116 + }, + { + "epoch": 2.5814345991561183, + "grad_norm": 1.2683454751968384, + "learning_rate": 6.831866840389719e-05, + "loss": 0.6160622835159302, + "step": 6118 + }, + { + "epoch": 2.5822784810126582, + "grad_norm": 1.049797534942627, + "learning_rate": 6.829632145782441e-05, + "loss": 0.5220097899436951, + "step": 6120 + }, + { + "epoch": 2.583122362869198, + "grad_norm": 1.1798468828201294, + "learning_rate": 6.827397029114168e-05, + "loss": 0.5709835290908813, + "step": 6122 + }, + { + "epoch": 2.5839662447257385, + "grad_norm": 1.0136369466781616, + "learning_rate": 6.825161490900495e-05, + "loss": 0.5086703300476074, + "step": 6124 + }, + { + "epoch": 2.5848101265822785, + "grad_norm": 1.147735595703125, + "learning_rate": 6.822925531657119e-05, + "loss": 0.5904423594474792, + "step": 6126 + }, + { + "epoch": 2.5856540084388184, + "grad_norm": 0.9979357123374939, + "learning_rate": 6.820689151899833e-05, + "loss": 0.5002011060714722, + "step": 6128 + }, + { + "epoch": 2.586497890295359, + "grad_norm": 1.4129728078842163, + "learning_rate": 6.818452352144527e-05, + "loss": 0.5694814920425415, + "step": 6130 + }, + { + "epoch": 2.5873417721518988, + "grad_norm": 1.1388975381851196, + "learning_rate": 6.816215132907186e-05, + "loss": 0.5448270440101624, + "step": 6132 + }, + { + "epoch": 2.5881856540084387, + "grad_norm": 1.268865942955017, + "learning_rate": 6.813977494703896e-05, + "loss": 0.6184739470481873, + "step": 6134 + }, + { + "epoch": 2.589029535864979, + "grad_norm": 1.2403846979141235, + "learning_rate": 6.811739438050835e-05, + "loss": 0.6493034958839417, + "step": 6136 + }, + { + "epoch": 2.589873417721519, + "grad_norm": 1.108298659324646, + "learning_rate": 6.809500963464282e-05, + "loss": 0.6168854236602783, + "step": 6138 + }, + { + "epoch": 2.590717299578059, + "grad_norm": 1.106427788734436, + "learning_rate": 6.807262071460609e-05, + "loss": 0.5734958052635193, + "step": 6140 + }, + { + "epoch": 2.5915611814345993, + "grad_norm": 1.147791862487793, + "learning_rate": 6.805022762556286e-05, + "loss": 0.5422238111495972, + "step": 6142 + }, + { + "epoch": 2.5924050632911393, + "grad_norm": 1.214465856552124, + "learning_rate": 6.802783037267874e-05, + "loss": 0.6511701345443726, + "step": 6144 + }, + { + "epoch": 2.5932489451476792, + "grad_norm": 1.087735891342163, + "learning_rate": 6.800542896112043e-05, + "loss": 0.5978493094444275, + "step": 6146 + }, + { + "epoch": 2.5940928270042196, + "grad_norm": 1.0772241353988647, + "learning_rate": 6.798302339605544e-05, + "loss": 0.5656765699386597, + "step": 6148 + }, + { + "epoch": 2.5949367088607596, + "grad_norm": 1.1666499376296997, + "learning_rate": 6.796061368265231e-05, + "loss": 0.6147777438163757, + "step": 6150 + }, + { + "epoch": 2.5957805907172995, + "grad_norm": 0.9949467182159424, + "learning_rate": 6.793819982608057e-05, + "loss": 0.502659022808075, + "step": 6152 + }, + { + "epoch": 2.59662447257384, + "grad_norm": 1.311484456062317, + "learning_rate": 6.791578183151061e-05, + "loss": 0.6019812226295471, + "step": 6154 + }, + { + "epoch": 2.59746835443038, + "grad_norm": 0.9594855904579163, + "learning_rate": 6.789335970411387e-05, + "loss": 0.625690221786499, + "step": 6156 + }, + { + "epoch": 2.5983122362869198, + "grad_norm": 1.2252063751220703, + "learning_rate": 6.78709334490627e-05, + "loss": 0.628356397151947, + "step": 6158 + }, + { + "epoch": 2.59915611814346, + "grad_norm": 1.089603304862976, + "learning_rate": 6.784850307153043e-05, + "loss": 0.5447192192077637, + "step": 6160 + }, + { + "epoch": 2.6, + "grad_norm": 1.1035163402557373, + "learning_rate": 6.782606857669125e-05, + "loss": 0.5400487184524536, + "step": 6162 + }, + { + "epoch": 2.60084388185654, + "grad_norm": 1.2329976558685303, + "learning_rate": 6.780362996972042e-05, + "loss": 0.5795643329620361, + "step": 6164 + }, + { + "epoch": 2.6016877637130804, + "grad_norm": 1.2984000444412231, + "learning_rate": 6.778118725579408e-05, + "loss": 0.5664985775947571, + "step": 6166 + }, + { + "epoch": 2.6025316455696204, + "grad_norm": 1.3563600778579712, + "learning_rate": 6.775874044008933e-05, + "loss": 0.5406283140182495, + "step": 6168 + }, + { + "epoch": 2.6033755274261603, + "grad_norm": 1.1897385120391846, + "learning_rate": 6.773628952778421e-05, + "loss": 0.5362374782562256, + "step": 6170 + }, + { + "epoch": 2.6042194092827007, + "grad_norm": 1.1492685079574585, + "learning_rate": 6.771383452405773e-05, + "loss": 0.5942689180374146, + "step": 6172 + }, + { + "epoch": 2.6050632911392406, + "grad_norm": 1.2306408882141113, + "learning_rate": 6.769137543408985e-05, + "loss": 0.6144227981567383, + "step": 6174 + }, + { + "epoch": 2.6059071729957806, + "grad_norm": 1.1260589361190796, + "learning_rate": 6.766891226306143e-05, + "loss": 0.5147640705108643, + "step": 6176 + }, + { + "epoch": 2.606751054852321, + "grad_norm": 1.214007019996643, + "learning_rate": 6.764644501615427e-05, + "loss": 0.6822091341018677, + "step": 6178 + }, + { + "epoch": 2.607594936708861, + "grad_norm": 1.2251341342926025, + "learning_rate": 6.762397369855116e-05, + "loss": 0.5330857038497925, + "step": 6180 + }, + { + "epoch": 2.608438818565401, + "grad_norm": 1.3556525707244873, + "learning_rate": 6.760149831543578e-05, + "loss": 0.58979332447052, + "step": 6182 + }, + { + "epoch": 2.6092827004219408, + "grad_norm": 1.286598563194275, + "learning_rate": 6.757901887199278e-05, + "loss": 0.5667334198951721, + "step": 6184 + }, + { + "epoch": 2.610126582278481, + "grad_norm": 1.2515888214111328, + "learning_rate": 6.755653537340776e-05, + "loss": 0.6028750538825989, + "step": 6186 + }, + { + "epoch": 2.610970464135021, + "grad_norm": 1.1090617179870605, + "learning_rate": 6.753404782486719e-05, + "loss": 0.604102611541748, + "step": 6188 + }, + { + "epoch": 2.611814345991561, + "grad_norm": 1.1782273054122925, + "learning_rate": 6.751155623155853e-05, + "loss": 0.5486276745796204, + "step": 6190 + }, + { + "epoch": 2.6126582278481014, + "grad_norm": 1.5475431680679321, + "learning_rate": 6.748906059867018e-05, + "loss": 0.630682110786438, + "step": 6192 + }, + { + "epoch": 2.6135021097046414, + "grad_norm": 1.237891435623169, + "learning_rate": 6.746656093139143e-05, + "loss": 0.571597695350647, + "step": 6194 + }, + { + "epoch": 2.6143459915611813, + "grad_norm": 1.2367130517959595, + "learning_rate": 6.744405723491253e-05, + "loss": 0.6020040512084961, + "step": 6196 + }, + { + "epoch": 2.6151898734177212, + "grad_norm": 1.0747612714767456, + "learning_rate": 6.742154951442464e-05, + "loss": 0.5520704984664917, + "step": 6198 + }, + { + "epoch": 2.6160337552742616, + "grad_norm": 1.3944035768508911, + "learning_rate": 6.739903777511985e-05, + "loss": 0.7312755584716797, + "step": 6200 + }, + { + "epoch": 2.6160337552742616, + "eval_loss": 0.6795271039009094, + "eval_runtime": 513.2393, + "eval_samples_per_second": 4.105, + "eval_steps_per_second": 4.105, + "step": 6200 + }, + { + "epoch": 2.6168776371308016, + "grad_norm": 1.3716613054275513, + "learning_rate": 6.737652202219121e-05, + "loss": 0.617123007774353, + "step": 6202 + }, + { + "epoch": 2.6177215189873415, + "grad_norm": 1.1962300539016724, + "learning_rate": 6.735400226083267e-05, + "loss": 0.5791950225830078, + "step": 6204 + }, + { + "epoch": 2.618565400843882, + "grad_norm": 1.2570394277572632, + "learning_rate": 6.733147849623909e-05, + "loss": 0.5941018462181091, + "step": 6206 + }, + { + "epoch": 2.619409282700422, + "grad_norm": 1.2903523445129395, + "learning_rate": 6.730895073360628e-05, + "loss": 0.5417253971099854, + "step": 6208 + }, + { + "epoch": 2.620253164556962, + "grad_norm": 1.0618562698364258, + "learning_rate": 6.728641897813096e-05, + "loss": 0.536359965801239, + "step": 6210 + }, + { + "epoch": 2.621097046413502, + "grad_norm": 1.307300090789795, + "learning_rate": 6.726388323501077e-05, + "loss": 0.6409479975700378, + "step": 6212 + }, + { + "epoch": 2.621940928270042, + "grad_norm": 1.3672584295272827, + "learning_rate": 6.72413435094443e-05, + "loss": 0.66277676820755, + "step": 6214 + }, + { + "epoch": 2.622784810126582, + "grad_norm": 1.2156232595443726, + "learning_rate": 6.721879980663098e-05, + "loss": 0.6193054914474487, + "step": 6216 + }, + { + "epoch": 2.6236286919831224, + "grad_norm": 1.1575636863708496, + "learning_rate": 6.719625213177124e-05, + "loss": 0.5773701667785645, + "step": 6218 + }, + { + "epoch": 2.6244725738396624, + "grad_norm": 1.2327474355697632, + "learning_rate": 6.71737004900664e-05, + "loss": 0.6913977265357971, + "step": 6220 + }, + { + "epoch": 2.6253164556962023, + "grad_norm": 1.1316778659820557, + "learning_rate": 6.715114488671869e-05, + "loss": 0.5773524045944214, + "step": 6222 + }, + { + "epoch": 2.6261603375527427, + "grad_norm": 1.1508816480636597, + "learning_rate": 6.712858532693125e-05, + "loss": 0.5554601550102234, + "step": 6224 + }, + { + "epoch": 2.6270042194092826, + "grad_norm": 1.2404967546463013, + "learning_rate": 6.710602181590812e-05, + "loss": 0.6090670824050903, + "step": 6226 + }, + { + "epoch": 2.6278481012658226, + "grad_norm": 1.0721718072891235, + "learning_rate": 6.70834543588543e-05, + "loss": 0.5546537637710571, + "step": 6228 + }, + { + "epoch": 2.628691983122363, + "grad_norm": 1.2788114547729492, + "learning_rate": 6.706088296097564e-05, + "loss": 0.5939876437187195, + "step": 6230 + }, + { + "epoch": 2.629535864978903, + "grad_norm": 1.1952526569366455, + "learning_rate": 6.703830762747896e-05, + "loss": 0.5291836857795715, + "step": 6232 + }, + { + "epoch": 2.630379746835443, + "grad_norm": 1.0261807441711426, + "learning_rate": 6.701572836357191e-05, + "loss": 0.518436074256897, + "step": 6234 + }, + { + "epoch": 2.6312236286919832, + "grad_norm": 1.1804791688919067, + "learning_rate": 6.699314517446316e-05, + "loss": 0.5830684900283813, + "step": 6236 + }, + { + "epoch": 2.632067510548523, + "grad_norm": 1.2079823017120361, + "learning_rate": 6.697055806536214e-05, + "loss": 0.5899971127510071, + "step": 6238 + }, + { + "epoch": 2.632911392405063, + "grad_norm": 1.1989154815673828, + "learning_rate": 6.694796704147932e-05, + "loss": 0.6533132791519165, + "step": 6240 + }, + { + "epoch": 2.6337552742616035, + "grad_norm": 1.0621024370193481, + "learning_rate": 6.692537210802598e-05, + "loss": 0.5341002345085144, + "step": 6242 + }, + { + "epoch": 2.6345991561181434, + "grad_norm": 1.2911880016326904, + "learning_rate": 6.690277327021436e-05, + "loss": 0.6795719861984253, + "step": 6244 + }, + { + "epoch": 2.6354430379746834, + "grad_norm": 1.3586145639419556, + "learning_rate": 6.688017053325757e-05, + "loss": 0.5390555262565613, + "step": 6246 + }, + { + "epoch": 2.6362869198312238, + "grad_norm": 1.31569242477417, + "learning_rate": 6.685756390236964e-05, + "loss": 0.5935586094856262, + "step": 6248 + }, + { + "epoch": 2.6371308016877637, + "grad_norm": 1.0801384449005127, + "learning_rate": 6.683495338276547e-05, + "loss": 0.5845919847488403, + "step": 6250 + }, + { + "epoch": 2.6379746835443036, + "grad_norm": 1.179715633392334, + "learning_rate": 6.681233897966087e-05, + "loss": 0.6017906665802002, + "step": 6252 + }, + { + "epoch": 2.638818565400844, + "grad_norm": 1.1927930116653442, + "learning_rate": 6.678972069827255e-05, + "loss": 0.6637946367263794, + "step": 6254 + }, + { + "epoch": 2.639662447257384, + "grad_norm": 1.2167247533798218, + "learning_rate": 6.676709854381812e-05, + "loss": 0.5572535991668701, + "step": 6256 + }, + { + "epoch": 2.640506329113924, + "grad_norm": 1.2026311159133911, + "learning_rate": 6.674447252151608e-05, + "loss": 0.5426514148712158, + "step": 6258 + }, + { + "epoch": 2.6413502109704643, + "grad_norm": 1.101891279220581, + "learning_rate": 6.672184263658579e-05, + "loss": 0.5123113989830017, + "step": 6260 + }, + { + "epoch": 2.6421940928270042, + "grad_norm": 1.3467986583709717, + "learning_rate": 6.669920889424758e-05, + "loss": 0.6018276214599609, + "step": 6262 + }, + { + "epoch": 2.643037974683544, + "grad_norm": 1.2477779388427734, + "learning_rate": 6.667657129972257e-05, + "loss": 0.5618380308151245, + "step": 6264 + }, + { + "epoch": 2.6438818565400846, + "grad_norm": 1.1284273862838745, + "learning_rate": 6.665392985823287e-05, + "loss": 0.5541924834251404, + "step": 6266 + }, + { + "epoch": 2.6447257383966245, + "grad_norm": 1.2376370429992676, + "learning_rate": 6.663128457500137e-05, + "loss": 0.5534335970878601, + "step": 6268 + }, + { + "epoch": 2.6455696202531644, + "grad_norm": 1.3205965757369995, + "learning_rate": 6.660863545525196e-05, + "loss": 0.6160520315170288, + "step": 6270 + }, + { + "epoch": 2.646413502109705, + "grad_norm": 1.175926685333252, + "learning_rate": 6.65859825042093e-05, + "loss": 0.6035991311073303, + "step": 6272 + }, + { + "epoch": 2.6472573839662448, + "grad_norm": 1.2805176973342896, + "learning_rate": 6.656332572709901e-05, + "loss": 0.6101992130279541, + "step": 6274 + }, + { + "epoch": 2.6481012658227847, + "grad_norm": 1.2493922710418701, + "learning_rate": 6.65406651291476e-05, + "loss": 0.5665684342384338, + "step": 6276 + }, + { + "epoch": 2.648945147679325, + "grad_norm": 1.3103299140930176, + "learning_rate": 6.65180007155824e-05, + "loss": 0.682868242263794, + "step": 6278 + }, + { + "epoch": 2.649789029535865, + "grad_norm": 1.3098952770233154, + "learning_rate": 6.649533249163167e-05, + "loss": 0.6398087739944458, + "step": 6280 + }, + { + "epoch": 2.650632911392405, + "grad_norm": 1.230396032333374, + "learning_rate": 6.647266046252454e-05, + "loss": 0.5410205721855164, + "step": 6282 + }, + { + "epoch": 2.6514767932489454, + "grad_norm": 1.1755880117416382, + "learning_rate": 6.6449984633491e-05, + "loss": 0.6019781231880188, + "step": 6284 + }, + { + "epoch": 2.6523206751054853, + "grad_norm": 1.1013081073760986, + "learning_rate": 6.642730500976193e-05, + "loss": 0.5327204465866089, + "step": 6286 + }, + { + "epoch": 2.6531645569620252, + "grad_norm": 1.1285136938095093, + "learning_rate": 6.640462159656908e-05, + "loss": 0.6458070278167725, + "step": 6288 + }, + { + "epoch": 2.6540084388185656, + "grad_norm": 1.5320124626159668, + "learning_rate": 6.638193439914512e-05, + "loss": 0.6038496494293213, + "step": 6290 + }, + { + "epoch": 2.6548523206751056, + "grad_norm": 1.0231032371520996, + "learning_rate": 6.635924342272349e-05, + "loss": 0.5353283286094666, + "step": 6292 + }, + { + "epoch": 2.6556962025316455, + "grad_norm": 1.1871505975723267, + "learning_rate": 6.633654867253858e-05, + "loss": 0.644368588924408, + "step": 6294 + }, + { + "epoch": 2.656540084388186, + "grad_norm": 1.0641425848007202, + "learning_rate": 6.631385015382565e-05, + "loss": 0.5251830220222473, + "step": 6296 + }, + { + "epoch": 2.657383966244726, + "grad_norm": 0.8980898261070251, + "learning_rate": 6.62911478718208e-05, + "loss": 0.527733564376831, + "step": 6298 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 1.1694822311401367, + "learning_rate": 6.626844183176102e-05, + "loss": 0.5868222117424011, + "step": 6300 + }, + { + "epoch": 2.6582278481012658, + "eval_loss": 0.6781066656112671, + "eval_runtime": 512.3669, + "eval_samples_per_second": 4.112, + "eval_steps_per_second": 4.112, + "step": 6300 + }, + { + "epoch": 2.659071729957806, + "grad_norm": 1.3010352849960327, + "learning_rate": 6.624573203888413e-05, + "loss": 0.5965607166290283, + "step": 6302 + }, + { + "epoch": 2.659915611814346, + "grad_norm": 1.074964165687561, + "learning_rate": 6.62230184984289e-05, + "loss": 0.5776658654212952, + "step": 6304 + }, + { + "epoch": 2.660759493670886, + "grad_norm": 1.0930451154708862, + "learning_rate": 6.620030121563484e-05, + "loss": 0.584223210811615, + "step": 6306 + }, + { + "epoch": 2.6616033755274264, + "grad_norm": 1.1418803930282593, + "learning_rate": 6.617758019574243e-05, + "loss": 0.534063994884491, + "step": 6308 + }, + { + "epoch": 2.6624472573839664, + "grad_norm": 1.1602790355682373, + "learning_rate": 6.615485544399298e-05, + "loss": 0.5719610452651978, + "step": 6310 + }, + { + "epoch": 2.6632911392405063, + "grad_norm": 1.0926544666290283, + "learning_rate": 6.613212696562863e-05, + "loss": 0.5489934682846069, + "step": 6312 + }, + { + "epoch": 2.6641350210970463, + "grad_norm": 1.2560242414474487, + "learning_rate": 6.610939476589239e-05, + "loss": 0.5568612217903137, + "step": 6314 + }, + { + "epoch": 2.6649789029535866, + "grad_norm": 1.110960602760315, + "learning_rate": 6.60866588500282e-05, + "loss": 0.6019266247749329, + "step": 6316 + }, + { + "epoch": 2.6658227848101266, + "grad_norm": 1.333012342453003, + "learning_rate": 6.606391922328074e-05, + "loss": 0.6083081364631653, + "step": 6318 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.1256170272827148, + "learning_rate": 6.604117589089564e-05, + "loss": 0.5586183071136475, + "step": 6320 + }, + { + "epoch": 2.667510548523207, + "grad_norm": 1.2877609729766846, + "learning_rate": 6.601842885811934e-05, + "loss": 0.5676470994949341, + "step": 6322 + }, + { + "epoch": 2.668354430379747, + "grad_norm": 1.305034875869751, + "learning_rate": 6.599567813019914e-05, + "loss": 0.6470263600349426, + "step": 6324 + }, + { + "epoch": 2.669198312236287, + "grad_norm": 1.1695195436477661, + "learning_rate": 6.597292371238318e-05, + "loss": 0.588540256023407, + "step": 6326 + }, + { + "epoch": 2.670042194092827, + "grad_norm": 1.084652304649353, + "learning_rate": 6.59501656099205e-05, + "loss": 0.602922260761261, + "step": 6328 + }, + { + "epoch": 2.670886075949367, + "grad_norm": 1.1664962768554688, + "learning_rate": 6.592740382806094e-05, + "loss": 0.5613425970077515, + "step": 6330 + }, + { + "epoch": 2.671729957805907, + "grad_norm": 1.2208726406097412, + "learning_rate": 6.590463837205522e-05, + "loss": 0.5850927829742432, + "step": 6332 + }, + { + "epoch": 2.672573839662447, + "grad_norm": 1.0662479400634766, + "learning_rate": 6.588186924715488e-05, + "loss": 0.503675639629364, + "step": 6334 + }, + { + "epoch": 2.6734177215189874, + "grad_norm": 1.5318000316619873, + "learning_rate": 6.58590964586123e-05, + "loss": 0.6245100498199463, + "step": 6336 + }, + { + "epoch": 2.6742616033755273, + "grad_norm": 1.402784824371338, + "learning_rate": 6.583632001168077e-05, + "loss": 0.6556243896484375, + "step": 6338 + }, + { + "epoch": 2.6751054852320673, + "grad_norm": 1.2293213605880737, + "learning_rate": 6.581353991161435e-05, + "loss": 0.6398119926452637, + "step": 6340 + }, + { + "epoch": 2.6759493670886076, + "grad_norm": 1.2687599658966064, + "learning_rate": 6.579075616366797e-05, + "loss": 0.5792493224143982, + "step": 6342 + }, + { + "epoch": 2.6767932489451476, + "grad_norm": 1.2112480401992798, + "learning_rate": 6.576796877309741e-05, + "loss": 0.6669304966926575, + "step": 6344 + }, + { + "epoch": 2.6776371308016875, + "grad_norm": 1.3074487447738647, + "learning_rate": 6.574517774515929e-05, + "loss": 0.6012452840805054, + "step": 6346 + }, + { + "epoch": 2.678481012658228, + "grad_norm": 1.3157081604003906, + "learning_rate": 6.572238308511106e-05, + "loss": 0.6556297540664673, + "step": 6348 + }, + { + "epoch": 2.679324894514768, + "grad_norm": 1.0735292434692383, + "learning_rate": 6.569958479821099e-05, + "loss": 0.5607976317405701, + "step": 6350 + }, + { + "epoch": 2.680168776371308, + "grad_norm": 1.1896809339523315, + "learning_rate": 6.567678288971825e-05, + "loss": 0.6040812730789185, + "step": 6352 + }, + { + "epoch": 2.681012658227848, + "grad_norm": 1.1350760459899902, + "learning_rate": 6.565397736489274e-05, + "loss": 0.5807676911354065, + "step": 6354 + }, + { + "epoch": 2.681856540084388, + "grad_norm": 1.3865782022476196, + "learning_rate": 6.563116822899532e-05, + "loss": 0.5877989530563354, + "step": 6356 + }, + { + "epoch": 2.682700421940928, + "grad_norm": 1.218682050704956, + "learning_rate": 6.560835548728758e-05, + "loss": 0.614531397819519, + "step": 6358 + }, + { + "epoch": 2.6835443037974684, + "grad_norm": 1.06162691116333, + "learning_rate": 6.5585539145032e-05, + "loss": 0.5880973935127258, + "step": 6360 + }, + { + "epoch": 2.6843881856540084, + "grad_norm": 1.264328956604004, + "learning_rate": 6.556271920749187e-05, + "loss": 0.5795428156852722, + "step": 6362 + }, + { + "epoch": 2.6852320675105483, + "grad_norm": 1.335652470588684, + "learning_rate": 6.553989567993129e-05, + "loss": 0.5927176475524902, + "step": 6364 + }, + { + "epoch": 2.6860759493670887, + "grad_norm": 1.1110745668411255, + "learning_rate": 6.551706856761524e-05, + "loss": 0.5814473628997803, + "step": 6366 + }, + { + "epoch": 2.6869198312236287, + "grad_norm": 1.1731220483779907, + "learning_rate": 6.549423787580947e-05, + "loss": 0.557738184928894, + "step": 6368 + }, + { + "epoch": 2.6877637130801686, + "grad_norm": 1.2679874897003174, + "learning_rate": 6.54714036097806e-05, + "loss": 0.5947291254997253, + "step": 6370 + }, + { + "epoch": 2.688607594936709, + "grad_norm": 1.112322211265564, + "learning_rate": 6.544856577479606e-05, + "loss": 0.5769563317298889, + "step": 6372 + }, + { + "epoch": 2.689451476793249, + "grad_norm": 1.3385759592056274, + "learning_rate": 6.542572437612408e-05, + "loss": 0.6077675223350525, + "step": 6374 + }, + { + "epoch": 2.690295358649789, + "grad_norm": 1.0953450202941895, + "learning_rate": 6.540287941903375e-05, + "loss": 0.5600538849830627, + "step": 6376 + }, + { + "epoch": 2.6911392405063292, + "grad_norm": 1.2455042600631714, + "learning_rate": 6.538003090879495e-05, + "loss": 0.5828459858894348, + "step": 6378 + }, + { + "epoch": 2.691983122362869, + "grad_norm": 1.2563562393188477, + "learning_rate": 6.53571788506784e-05, + "loss": 0.5844002366065979, + "step": 6380 + }, + { + "epoch": 2.692827004219409, + "grad_norm": 1.3466061353683472, + "learning_rate": 6.533432324995563e-05, + "loss": 0.6632003784179688, + "step": 6382 + }, + { + "epoch": 2.6936708860759495, + "grad_norm": 1.2467784881591797, + "learning_rate": 6.531146411189899e-05, + "loss": 0.5532103180885315, + "step": 6384 + }, + { + "epoch": 2.6945147679324895, + "grad_norm": 1.344250202178955, + "learning_rate": 6.528860144178163e-05, + "loss": 0.5722881555557251, + "step": 6386 + }, + { + "epoch": 2.6953586497890294, + "grad_norm": 1.3688865900039673, + "learning_rate": 6.526573524487756e-05, + "loss": 0.6424282789230347, + "step": 6388 + }, + { + "epoch": 2.6962025316455698, + "grad_norm": 1.4252339601516724, + "learning_rate": 6.524286552646153e-05, + "loss": 0.5986620783805847, + "step": 6390 + }, + { + "epoch": 2.6970464135021097, + "grad_norm": 1.4102380275726318, + "learning_rate": 6.52199922918092e-05, + "loss": 0.6466318368911743, + "step": 6392 + }, + { + "epoch": 2.6978902953586497, + "grad_norm": 1.184442400932312, + "learning_rate": 6.519711554619692e-05, + "loss": 0.6259894371032715, + "step": 6394 + }, + { + "epoch": 2.69873417721519, + "grad_norm": 1.2751896381378174, + "learning_rate": 6.517423529490198e-05, + "loss": 0.5682622194290161, + "step": 6396 + }, + { + "epoch": 2.69957805907173, + "grad_norm": 1.3333114385604858, + "learning_rate": 6.515135154320236e-05, + "loss": 0.573390007019043, + "step": 6398 + }, + { + "epoch": 2.70042194092827, + "grad_norm": 1.2505477666854858, + "learning_rate": 6.512846429637693e-05, + "loss": 0.5839408040046692, + "step": 6400 + }, + { + "epoch": 2.70042194092827, + "eval_loss": 0.6764505505561829, + "eval_runtime": 512.7682, + "eval_samples_per_second": 4.109, + "eval_steps_per_second": 4.109, + "step": 6400 + }, + { + "epoch": 2.7012658227848103, + "grad_norm": 1.2822065353393555, + "learning_rate": 6.510557355970534e-05, + "loss": 0.6000106334686279, + "step": 6402 + }, + { + "epoch": 2.7021097046413503, + "grad_norm": 1.2144463062286377, + "learning_rate": 6.508267933846803e-05, + "loss": 0.5796633362770081, + "step": 6404 + }, + { + "epoch": 2.70295358649789, + "grad_norm": 1.189985990524292, + "learning_rate": 6.505978163794628e-05, + "loss": 0.5976626873016357, + "step": 6406 + }, + { + "epoch": 2.7037974683544306, + "grad_norm": 1.0484727621078491, + "learning_rate": 6.503688046342212e-05, + "loss": 0.5054599642753601, + "step": 6408 + }, + { + "epoch": 2.7046413502109705, + "grad_norm": 1.4333025217056274, + "learning_rate": 6.501397582017844e-05, + "loss": 0.6539149284362793, + "step": 6410 + }, + { + "epoch": 2.7054852320675105, + "grad_norm": 1.1808522939682007, + "learning_rate": 6.499106771349887e-05, + "loss": 0.5220640301704407, + "step": 6412 + }, + { + "epoch": 2.706329113924051, + "grad_norm": 2.8626298904418945, + "learning_rate": 6.496815614866791e-05, + "loss": 0.6019118428230286, + "step": 6414 + }, + { + "epoch": 2.707172995780591, + "grad_norm": 1.1092768907546997, + "learning_rate": 6.494524113097078e-05, + "loss": 0.5754269361495972, + "step": 6416 + }, + { + "epoch": 2.7080168776371307, + "grad_norm": 1.2416579723358154, + "learning_rate": 6.492232266569353e-05, + "loss": 0.5548025369644165, + "step": 6418 + }, + { + "epoch": 2.708860759493671, + "grad_norm": 1.012360692024231, + "learning_rate": 6.489940075812306e-05, + "loss": 0.5706405639648438, + "step": 6420 + }, + { + "epoch": 2.709704641350211, + "grad_norm": 1.376641869544983, + "learning_rate": 6.487647541354698e-05, + "loss": 0.5862169861793518, + "step": 6422 + }, + { + "epoch": 2.710548523206751, + "grad_norm": 1.2425684928894043, + "learning_rate": 6.485354663725374e-05, + "loss": 0.5928428769111633, + "step": 6424 + }, + { + "epoch": 2.7113924050632914, + "grad_norm": 1.0926302671432495, + "learning_rate": 6.483061443453254e-05, + "loss": 0.5903078317642212, + "step": 6426 + }, + { + "epoch": 2.7122362869198313, + "grad_norm": 1.3698115348815918, + "learning_rate": 6.480767881067342e-05, + "loss": 0.5848883986473083, + "step": 6428 + }, + { + "epoch": 2.7130801687763713, + "grad_norm": 1.2949504852294922, + "learning_rate": 6.478473977096718e-05, + "loss": 0.5285207629203796, + "step": 6430 + }, + { + "epoch": 2.7139240506329116, + "grad_norm": 1.3662208318710327, + "learning_rate": 6.476179732070543e-05, + "loss": 0.5965171456336975, + "step": 6432 + }, + { + "epoch": 2.7147679324894516, + "grad_norm": 1.3127343654632568, + "learning_rate": 6.473885146518055e-05, + "loss": 0.6549378037452698, + "step": 6434 + }, + { + "epoch": 2.7156118143459915, + "grad_norm": 1.199431300163269, + "learning_rate": 6.471590220968568e-05, + "loss": 0.574461042881012, + "step": 6436 + }, + { + "epoch": 2.716455696202532, + "grad_norm": 1.1624091863632202, + "learning_rate": 6.469294955951481e-05, + "loss": 0.6142178177833557, + "step": 6438 + }, + { + "epoch": 2.717299578059072, + "grad_norm": 1.2685147523880005, + "learning_rate": 6.466999351996266e-05, + "loss": 0.5775829553604126, + "step": 6440 + }, + { + "epoch": 2.718143459915612, + "grad_norm": 1.0987834930419922, + "learning_rate": 6.464703409632476e-05, + "loss": 0.5400159955024719, + "step": 6442 + }, + { + "epoch": 2.7189873417721517, + "grad_norm": 1.2638986110687256, + "learning_rate": 6.462407129389736e-05, + "loss": 0.558712899684906, + "step": 6444 + }, + { + "epoch": 2.719831223628692, + "grad_norm": 1.174168586730957, + "learning_rate": 6.46011051179776e-05, + "loss": 0.5465238094329834, + "step": 6446 + }, + { + "epoch": 2.720675105485232, + "grad_norm": 1.2185649871826172, + "learning_rate": 6.457813557386331e-05, + "loss": 0.629173219203949, + "step": 6448 + }, + { + "epoch": 2.721518987341772, + "grad_norm": 1.1563167572021484, + "learning_rate": 6.455516266685311e-05, + "loss": 0.5557543039321899, + "step": 6450 + }, + { + "epoch": 2.7223628691983124, + "grad_norm": 1.2934051752090454, + "learning_rate": 6.453218640224642e-05, + "loss": 0.6350696682929993, + "step": 6452 + }, + { + "epoch": 2.7232067510548523, + "grad_norm": 1.045218825340271, + "learning_rate": 6.450920678534342e-05, + "loss": 0.544219434261322, + "step": 6454 + }, + { + "epoch": 2.7240506329113923, + "grad_norm": 1.3102771043777466, + "learning_rate": 6.44862238214451e-05, + "loss": 0.6312481760978699, + "step": 6456 + }, + { + "epoch": 2.7248945147679327, + "grad_norm": 1.3338704109191895, + "learning_rate": 6.446323751585312e-05, + "loss": 0.5772860050201416, + "step": 6458 + }, + { + "epoch": 2.7257383966244726, + "grad_norm": 1.1826046705245972, + "learning_rate": 6.444024787387003e-05, + "loss": 0.5450227856636047, + "step": 6460 + }, + { + "epoch": 2.7265822784810125, + "grad_norm": 1.2449530363082886, + "learning_rate": 6.441725490079908e-05, + "loss": 0.5775642395019531, + "step": 6462 + }, + { + "epoch": 2.7274261603375525, + "grad_norm": 1.1204898357391357, + "learning_rate": 6.439425860194432e-05, + "loss": 0.5795316100120544, + "step": 6464 + }, + { + "epoch": 2.728270042194093, + "grad_norm": 1.179542064666748, + "learning_rate": 6.437125898261056e-05, + "loss": 0.6187583804130554, + "step": 6466 + }, + { + "epoch": 2.729113924050633, + "grad_norm": 1.2231724262237549, + "learning_rate": 6.434825604810333e-05, + "loss": 0.581790566444397, + "step": 6468 + }, + { + "epoch": 2.7299578059071727, + "grad_norm": 1.178859829902649, + "learning_rate": 6.432524980372902e-05, + "loss": 0.5470858812332153, + "step": 6470 + }, + { + "epoch": 2.730801687763713, + "grad_norm": 1.2092641592025757, + "learning_rate": 6.430224025479469e-05, + "loss": 0.591381311416626, + "step": 6472 + }, + { + "epoch": 2.731645569620253, + "grad_norm": 1.395704746246338, + "learning_rate": 6.42792274066082e-05, + "loss": 0.6809561252593994, + "step": 6474 + }, + { + "epoch": 2.732489451476793, + "grad_norm": 1.1937509775161743, + "learning_rate": 6.42562112644782e-05, + "loss": 0.5667102932929993, + "step": 6476 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 1.2181694507598877, + "learning_rate": 6.423319183371405e-05, + "loss": 0.5832397937774658, + "step": 6478 + }, + { + "epoch": 2.7341772151898733, + "grad_norm": 0.9961143732070923, + "learning_rate": 6.42101691196259e-05, + "loss": 0.5432526469230652, + "step": 6480 + }, + { + "epoch": 2.7350210970464133, + "grad_norm": 1.2029842138290405, + "learning_rate": 6.418714312752466e-05, + "loss": 0.5740163326263428, + "step": 6482 + }, + { + "epoch": 2.7358649789029537, + "grad_norm": 1.4317080974578857, + "learning_rate": 6.416411386272196e-05, + "loss": 0.6384599804878235, + "step": 6484 + }, + { + "epoch": 2.7367088607594936, + "grad_norm": 1.2837908267974854, + "learning_rate": 6.414108133053022e-05, + "loss": 0.6619245409965515, + "step": 6486 + }, + { + "epoch": 2.7375527426160335, + "grad_norm": 1.1140583753585815, + "learning_rate": 6.41180455362626e-05, + "loss": 0.5453745126724243, + "step": 6488 + }, + { + "epoch": 2.738396624472574, + "grad_norm": 1.1226048469543457, + "learning_rate": 6.409500648523302e-05, + "loss": 0.6225460171699524, + "step": 6490 + }, + { + "epoch": 2.739240506329114, + "grad_norm": 1.2367178201675415, + "learning_rate": 6.407196418275613e-05, + "loss": 0.5767168402671814, + "step": 6492 + }, + { + "epoch": 2.740084388185654, + "grad_norm": 1.4078115224838257, + "learning_rate": 6.404891863414736e-05, + "loss": 0.6131237745285034, + "step": 6494 + }, + { + "epoch": 2.740928270042194, + "grad_norm": 1.21550452709198, + "learning_rate": 6.40258698447229e-05, + "loss": 0.5236409306526184, + "step": 6496 + }, + { + "epoch": 2.741772151898734, + "grad_norm": 1.22257661819458, + "learning_rate": 6.400281781979962e-05, + "loss": 0.5483267307281494, + "step": 6498 + }, + { + "epoch": 2.742616033755274, + "grad_norm": 1.1525336503982544, + "learning_rate": 6.39797625646952e-05, + "loss": 0.6161116361618042, + "step": 6500 + }, + { + "epoch": 2.742616033755274, + "eval_loss": 0.6768895387649536, + "eval_runtime": 513.0657, + "eval_samples_per_second": 4.107, + "eval_steps_per_second": 4.107, + "step": 6500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.751359103207182e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-6500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..96a07c2b8637c3e7c69037eb75ea51b298a483f4 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83291f6d455ca5ce17a07f21cc02ec56cba0671e1d6495dd81c1d98fd10b7c26 +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbe9b935e175788861fb53a2e68ac7307b32c514 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb2d6ab22e4c24d3bdb3455a91378a172481ceb90647d40564efeaf1bf56307c +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..929c8031b538d8ddf14095d5184fcfd31b9d11b6 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67c205064a25e0d7257d786bea3d827c41af89be5eecf88657f8f1dc80f0c97f +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8819a6ef07c2747c2be3bb8056b50a6a532e7fa --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cfb1fc1a61155f01252c49faa6f04f3bb891814d770cf69d6f143907b16f5db +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a3e222b4639c40e54f0340c0044c98a569e34f71 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/trainer_state.json @@ -0,0 +1,25103 @@ +{ + "best_global_step": 7000, + "best_metric": 0.6706293225288391, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-7000", + "epoch": 2.9535864978902953, + "eval_steps": 100, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + }, + { + "epoch": 1.89957805907173, + "grad_norm": 0.9778118133544922, + "learning_rate": 8.456131445302538e-05, + "loss": 0.5973100662231445, + "step": 4502 + }, + { + "epoch": 1.90042194092827, + "grad_norm": 0.9587083458900452, + "learning_rate": 8.454395658110536e-05, + "loss": 0.5982911586761475, + "step": 4504 + }, + { + "epoch": 1.90126582278481, + "grad_norm": 1.327643871307373, + "learning_rate": 8.452659074055798e-05, + "loss": 0.6858586668968201, + "step": 4506 + }, + { + "epoch": 1.9021097046413502, + "grad_norm": 1.0740257501602173, + "learning_rate": 8.450921693538922e-05, + "loss": 0.6172328591346741, + "step": 4508 + }, + { + "epoch": 1.9029535864978904, + "grad_norm": 1.0705101490020752, + "learning_rate": 8.449183516960685e-05, + "loss": 0.5349634289741516, + "step": 4510 + }, + { + "epoch": 1.9037974683544303, + "grad_norm": 0.9151237607002258, + "learning_rate": 8.447444544722058e-05, + "loss": 0.5769277811050415, + "step": 4512 + }, + { + "epoch": 1.9046413502109705, + "grad_norm": 1.139900803565979, + "learning_rate": 8.44570477722418e-05, + "loss": 0.6579093933105469, + "step": 4514 + }, + { + "epoch": 1.9054852320675106, + "grad_norm": 1.2481658458709717, + "learning_rate": 8.443964214868387e-05, + "loss": 0.6748929619789124, + "step": 4516 + }, + { + "epoch": 1.9063291139240506, + "grad_norm": 1.1661686897277832, + "learning_rate": 8.442222858056193e-05, + "loss": 0.6492021083831787, + "step": 4518 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 1.241477370262146, + "learning_rate": 8.440480707189295e-05, + "loss": 0.635409951210022, + "step": 4520 + }, + { + "epoch": 1.908016877637131, + "grad_norm": 1.1102054119110107, + "learning_rate": 8.438737762669573e-05, + "loss": 0.631928026676178, + "step": 4522 + }, + { + "epoch": 1.9088607594936708, + "grad_norm": 1.0638107061386108, + "learning_rate": 8.43699402489909e-05, + "loss": 0.604518473148346, + "step": 4524 + }, + { + "epoch": 1.909704641350211, + "grad_norm": 1.0270655155181885, + "learning_rate": 8.435249494280096e-05, + "loss": 0.61314457654953, + "step": 4526 + }, + { + "epoch": 1.9105485232067512, + "grad_norm": 1.1840111017227173, + "learning_rate": 8.433504171215018e-05, + "loss": 0.661663293838501, + "step": 4528 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 1.1404399871826172, + "learning_rate": 8.43175805610647e-05, + "loss": 0.7026967406272888, + "step": 4530 + }, + { + "epoch": 1.9122362869198313, + "grad_norm": 1.2371265888214111, + "learning_rate": 8.430011149357246e-05, + "loss": 0.6599440574645996, + "step": 4532 + }, + { + "epoch": 1.9130801687763714, + "grad_norm": 1.0042651891708374, + "learning_rate": 8.428263451370326e-05, + "loss": 0.5728344321250916, + "step": 4534 + }, + { + "epoch": 1.9139240506329114, + "grad_norm": 1.04367196559906, + "learning_rate": 8.426514962548866e-05, + "loss": 0.6495450735092163, + "step": 4536 + }, + { + "epoch": 1.9147679324894513, + "grad_norm": 1.0867135524749756, + "learning_rate": 8.424765683296215e-05, + "loss": 0.6406553387641907, + "step": 4538 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 1.0751310586929321, + "learning_rate": 8.423015614015892e-05, + "loss": 0.6692186594009399, + "step": 4540 + }, + { + "epoch": 1.9164556962025316, + "grad_norm": 1.13556969165802, + "learning_rate": 8.421264755111607e-05, + "loss": 0.6029785871505737, + "step": 4542 + }, + { + "epoch": 1.9172995780590716, + "grad_norm": 1.1560977697372437, + "learning_rate": 8.419513106987251e-05, + "loss": 0.6457844972610474, + "step": 4544 + }, + { + "epoch": 1.918143459915612, + "grad_norm": 1.2192902565002441, + "learning_rate": 8.417760670046893e-05, + "loss": 0.7082147598266602, + "step": 4546 + }, + { + "epoch": 1.918987341772152, + "grad_norm": 1.1170696020126343, + "learning_rate": 8.41600744469479e-05, + "loss": 0.6919234991073608, + "step": 4548 + }, + { + "epoch": 1.9198312236286919, + "grad_norm": 1.061253547668457, + "learning_rate": 8.414253431335373e-05, + "loss": 0.6310052871704102, + "step": 4550 + }, + { + "epoch": 1.920675105485232, + "grad_norm": 1.0671885013580322, + "learning_rate": 8.412498630373263e-05, + "loss": 0.6330236792564392, + "step": 4552 + }, + { + "epoch": 1.9215189873417722, + "grad_norm": 1.2085163593292236, + "learning_rate": 8.410743042213256e-05, + "loss": 0.7031015157699585, + "step": 4554 + }, + { + "epoch": 1.9223628691983121, + "grad_norm": 1.2682013511657715, + "learning_rate": 8.408986667260334e-05, + "loss": 0.7078304290771484, + "step": 4556 + }, + { + "epoch": 1.9232067510548523, + "grad_norm": 1.2966876029968262, + "learning_rate": 8.407229505919658e-05, + "loss": 0.6542860865592957, + "step": 4558 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 1.1086169481277466, + "learning_rate": 8.405471558596573e-05, + "loss": 0.5856828093528748, + "step": 4560 + }, + { + "epoch": 1.9248945147679324, + "grad_norm": 1.3175504207611084, + "learning_rate": 8.403712825696604e-05, + "loss": 0.7382104992866516, + "step": 4562 + }, + { + "epoch": 1.9257383966244725, + "grad_norm": 1.163164496421814, + "learning_rate": 8.401953307625454e-05, + "loss": 0.6862360239028931, + "step": 4564 + }, + { + "epoch": 1.9265822784810127, + "grad_norm": 1.207650899887085, + "learning_rate": 8.400193004789013e-05, + "loss": 0.7442302703857422, + "step": 4566 + }, + { + "epoch": 1.9274261603375527, + "grad_norm": 1.1570589542388916, + "learning_rate": 8.398431917593345e-05, + "loss": 0.595226526260376, + "step": 4568 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 1.091927170753479, + "learning_rate": 8.396670046444704e-05, + "loss": 0.6360410451889038, + "step": 4570 + }, + { + "epoch": 1.929113924050633, + "grad_norm": 1.149559497833252, + "learning_rate": 8.394907391749516e-05, + "loss": 0.6343122124671936, + "step": 4572 + }, + { + "epoch": 1.929957805907173, + "grad_norm": 1.0585254430770874, + "learning_rate": 8.393143953914395e-05, + "loss": 0.7394745349884033, + "step": 4574 + }, + { + "epoch": 1.930801687763713, + "grad_norm": 1.1648521423339844, + "learning_rate": 8.391379733346128e-05, + "loss": 0.6489678025245667, + "step": 4576 + }, + { + "epoch": 1.9316455696202532, + "grad_norm": 1.1756316423416138, + "learning_rate": 8.389614730451692e-05, + "loss": 0.6687861084938049, + "step": 4578 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.9857237339019775, + "learning_rate": 8.387848945638235e-05, + "loss": 0.523727536201477, + "step": 4580 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.1038693189620972, + "learning_rate": 8.386082379313092e-05, + "loss": 0.6545047760009766, + "step": 4582 + }, + { + "epoch": 1.9341772151898735, + "grad_norm": 1.0780832767486572, + "learning_rate": 8.384315031883774e-05, + "loss": 0.6067036390304565, + "step": 4584 + }, + { + "epoch": 1.9350210970464135, + "grad_norm": 1.2915070056915283, + "learning_rate": 8.382546903757975e-05, + "loss": 0.6880824565887451, + "step": 4586 + }, + { + "epoch": 1.9358649789029536, + "grad_norm": 1.1243441104888916, + "learning_rate": 8.380777995343568e-05, + "loss": 0.7319117188453674, + "step": 4588 + }, + { + "epoch": 1.9367088607594938, + "grad_norm": 1.1143072843551636, + "learning_rate": 8.379008307048609e-05, + "loss": 0.6845395565032959, + "step": 4590 + }, + { + "epoch": 1.9375527426160337, + "grad_norm": 1.039494276046753, + "learning_rate": 8.377237839281327e-05, + "loss": 0.6653600335121155, + "step": 4592 + }, + { + "epoch": 1.9383966244725739, + "grad_norm": 1.299617886543274, + "learning_rate": 8.375466592450136e-05, + "loss": 0.6352495551109314, + "step": 4594 + }, + { + "epoch": 1.939240506329114, + "grad_norm": 0.9918657541275024, + "learning_rate": 8.373694566963631e-05, + "loss": 0.5660957098007202, + "step": 4596 + }, + { + "epoch": 1.940084388185654, + "grad_norm": 1.0540478229522705, + "learning_rate": 8.371921763230579e-05, + "loss": 0.6296496987342834, + "step": 4598 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 1.1309545040130615, + "learning_rate": 8.370148181659939e-05, + "loss": 0.6672025918960571, + "step": 4600 + }, + { + "epoch": 1.9409282700421941, + "eval_loss": 0.6930755376815796, + "eval_runtime": 617.8927, + "eval_samples_per_second": 3.41, + "eval_steps_per_second": 3.41, + "step": 4600 + }, + { + "epoch": 1.9417721518987343, + "grad_norm": 1.2338588237762451, + "learning_rate": 8.368373822660836e-05, + "loss": 0.6200884580612183, + "step": 4602 + }, + { + "epoch": 1.9426160337552743, + "grad_norm": 1.1756945848464966, + "learning_rate": 8.366598686642582e-05, + "loss": 0.653294026851654, + "step": 4604 + }, + { + "epoch": 1.9434599156118142, + "grad_norm": 1.032018780708313, + "learning_rate": 8.364822774014671e-05, + "loss": 0.5670395493507385, + "step": 4606 + }, + { + "epoch": 1.9443037974683546, + "grad_norm": 1.045280933380127, + "learning_rate": 8.363046085186766e-05, + "loss": 0.6819197535514832, + "step": 4608 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 1.3223930597305298, + "learning_rate": 8.36126862056872e-05, + "loss": 0.6952820420265198, + "step": 4610 + }, + { + "epoch": 1.9459915611814345, + "grad_norm": 1.0048432350158691, + "learning_rate": 8.359490380570556e-05, + "loss": 0.5291440486907959, + "step": 4612 + }, + { + "epoch": 1.9468354430379748, + "grad_norm": 1.1477346420288086, + "learning_rate": 8.357711365602483e-05, + "loss": 0.6857813000679016, + "step": 4614 + }, + { + "epoch": 1.9476793248945148, + "grad_norm": 0.959985077381134, + "learning_rate": 8.355931576074882e-05, + "loss": 0.5581508278846741, + "step": 4616 + }, + { + "epoch": 1.9485232067510547, + "grad_norm": 1.1104289293289185, + "learning_rate": 8.35415101239832e-05, + "loss": 0.6536211371421814, + "step": 4618 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 1.2344517707824707, + "learning_rate": 8.352369674983535e-05, + "loss": 0.6570560336112976, + "step": 4620 + }, + { + "epoch": 1.950210970464135, + "grad_norm": 1.3411606550216675, + "learning_rate": 8.350587564241451e-05, + "loss": 0.6070495247840881, + "step": 4622 + }, + { + "epoch": 1.951054852320675, + "grad_norm": 1.1713159084320068, + "learning_rate": 8.348804680583166e-05, + "loss": 0.6444135904312134, + "step": 4624 + }, + { + "epoch": 1.9518987341772152, + "grad_norm": 1.127242922782898, + "learning_rate": 8.347021024419954e-05, + "loss": 0.6517419815063477, + "step": 4626 + }, + { + "epoch": 1.9527426160337553, + "grad_norm": 1.0733028650283813, + "learning_rate": 8.345236596163274e-05, + "loss": 0.6174065470695496, + "step": 4628 + }, + { + "epoch": 1.9535864978902953, + "grad_norm": 1.1114680767059326, + "learning_rate": 8.343451396224757e-05, + "loss": 0.7163593769073486, + "step": 4630 + }, + { + "epoch": 1.9544303797468354, + "grad_norm": 1.0839568376541138, + "learning_rate": 8.341665425016216e-05, + "loss": 0.698553204536438, + "step": 4632 + }, + { + "epoch": 1.9552742616033756, + "grad_norm": 1.17001211643219, + "learning_rate": 8.339878682949638e-05, + "loss": 0.6224857568740845, + "step": 4634 + }, + { + "epoch": 1.9561181434599155, + "grad_norm": 3.483793020248413, + "learning_rate": 8.338091170437193e-05, + "loss": 0.5931200981140137, + "step": 4636 + }, + { + "epoch": 1.9569620253164557, + "grad_norm": 1.1575394868850708, + "learning_rate": 8.336302887891224e-05, + "loss": 0.6031442284584045, + "step": 4638 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 1.1494992971420288, + "learning_rate": 8.334513835724252e-05, + "loss": 0.6101768016815186, + "step": 4640 + }, + { + "epoch": 1.9586497890295358, + "grad_norm": 1.3858197927474976, + "learning_rate": 8.332724014348981e-05, + "loss": 0.6571711301803589, + "step": 4642 + }, + { + "epoch": 1.959493670886076, + "grad_norm": 1.1094943284988403, + "learning_rate": 8.330933424178284e-05, + "loss": 0.6391071677207947, + "step": 4644 + }, + { + "epoch": 1.9603375527426161, + "grad_norm": 1.1640198230743408, + "learning_rate": 8.329142065625218e-05, + "loss": 0.6542805433273315, + "step": 4646 + }, + { + "epoch": 1.961181434599156, + "grad_norm": 1.1080211400985718, + "learning_rate": 8.327349939103016e-05, + "loss": 0.6053075194358826, + "step": 4648 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.0137052536010742, + "learning_rate": 8.325557045025085e-05, + "loss": 0.6009573340415955, + "step": 4650 + }, + { + "epoch": 1.9628691983122364, + "grad_norm": 1.0867283344268799, + "learning_rate": 8.323763383805012e-05, + "loss": 0.5993483066558838, + "step": 4652 + }, + { + "epoch": 1.9637130801687763, + "grad_norm": 1.0577161312103271, + "learning_rate": 8.321968955856562e-05, + "loss": 0.6788463592529297, + "step": 4654 + }, + { + "epoch": 1.9645569620253165, + "grad_norm": 1.2002183198928833, + "learning_rate": 8.320173761593672e-05, + "loss": 0.5786917209625244, + "step": 4656 + }, + { + "epoch": 1.9654008438818567, + "grad_norm": 1.2266993522644043, + "learning_rate": 8.318377801430461e-05, + "loss": 0.7437994480133057, + "step": 4658 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 1.007582187652588, + "learning_rate": 8.316581075781223e-05, + "loss": 0.6763550639152527, + "step": 4660 + }, + { + "epoch": 1.9670886075949368, + "grad_norm": 1.2374811172485352, + "learning_rate": 8.314783585060425e-05, + "loss": 0.6953140497207642, + "step": 4662 + }, + { + "epoch": 1.967932489451477, + "grad_norm": 1.1791057586669922, + "learning_rate": 8.312985329682717e-05, + "loss": 0.6867341995239258, + "step": 4664 + }, + { + "epoch": 1.9687763713080169, + "grad_norm": 1.1903331279754639, + "learning_rate": 8.31118631006292e-05, + "loss": 0.6445001363754272, + "step": 4666 + }, + { + "epoch": 1.9696202531645568, + "grad_norm": 1.1731067895889282, + "learning_rate": 8.309386526616034e-05, + "loss": 0.6500589847564697, + "step": 4668 + }, + { + "epoch": 1.9704641350210972, + "grad_norm": 0.9470233917236328, + "learning_rate": 8.307585979757233e-05, + "loss": 0.6215718984603882, + "step": 4670 + }, + { + "epoch": 1.9713080168776371, + "grad_norm": 1.2900800704956055, + "learning_rate": 8.305784669901872e-05, + "loss": 0.6396787762641907, + "step": 4672 + }, + { + "epoch": 1.972151898734177, + "grad_norm": 1.1729133129119873, + "learning_rate": 8.303982597465474e-05, + "loss": 0.6581959128379822, + "step": 4674 + }, + { + "epoch": 1.9729957805907175, + "grad_norm": 1.1450555324554443, + "learning_rate": 8.302179762863746e-05, + "loss": 0.7013490796089172, + "step": 4676 + }, + { + "epoch": 1.9738396624472574, + "grad_norm": 1.1506338119506836, + "learning_rate": 8.300376166512567e-05, + "loss": 0.6796102523803711, + "step": 4678 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 1.149979591369629, + "learning_rate": 8.298571808827991e-05, + "loss": 0.6960519552230835, + "step": 4680 + }, + { + "epoch": 1.9755274261603377, + "grad_norm": 1.1078912019729614, + "learning_rate": 8.296766690226249e-05, + "loss": 0.6789507865905762, + "step": 4682 + }, + { + "epoch": 1.9763713080168777, + "grad_norm": 1.0199202299118042, + "learning_rate": 8.294960811123747e-05, + "loss": 0.5962659120559692, + "step": 4684 + }, + { + "epoch": 1.9772151898734176, + "grad_norm": 1.2226134538650513, + "learning_rate": 8.293154171937068e-05, + "loss": 0.6483094692230225, + "step": 4686 + }, + { + "epoch": 1.9780590717299578, + "grad_norm": 1.184095025062561, + "learning_rate": 8.291346773082965e-05, + "loss": 0.6750242710113525, + "step": 4688 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 1.1018693447113037, + "learning_rate": 8.289538614978375e-05, + "loss": 0.7094066739082336, + "step": 4690 + }, + { + "epoch": 1.9797468354430379, + "grad_norm": 1.0342390537261963, + "learning_rate": 8.287729698040403e-05, + "loss": 0.6554126739501953, + "step": 4692 + }, + { + "epoch": 1.980590717299578, + "grad_norm": 1.0603563785552979, + "learning_rate": 8.285920022686332e-05, + "loss": 0.5493529438972473, + "step": 4694 + }, + { + "epoch": 1.9814345991561182, + "grad_norm": 1.139609932899475, + "learning_rate": 8.284109589333617e-05, + "loss": 0.6824741363525391, + "step": 4696 + }, + { + "epoch": 1.9822784810126581, + "grad_norm": 1.2167822122573853, + "learning_rate": 8.282298398399895e-05, + "loss": 0.7121000289916992, + "step": 4698 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 1.109857201576233, + "learning_rate": 8.280486450302968e-05, + "loss": 0.6711249351501465, + "step": 4700 + }, + { + "epoch": 1.9831223628691983, + "eval_loss": 0.6923081278800964, + "eval_runtime": 514.7729, + "eval_samples_per_second": 4.093, + "eval_steps_per_second": 4.093, + "step": 4700 + }, + { + "epoch": 1.9839662447257385, + "grad_norm": 1.1387107372283936, + "learning_rate": 8.27867374546082e-05, + "loss": 0.581635594367981, + "step": 4702 + }, + { + "epoch": 1.9848101265822784, + "grad_norm": 1.2519257068634033, + "learning_rate": 8.27686028429161e-05, + "loss": 0.6867302060127258, + "step": 4704 + }, + { + "epoch": 1.9856540084388186, + "grad_norm": 1.0927205085754395, + "learning_rate": 8.275046067213663e-05, + "loss": 0.6494556665420532, + "step": 4706 + }, + { + "epoch": 1.9864978902953587, + "grad_norm": 1.042035698890686, + "learning_rate": 8.273231094645487e-05, + "loss": 0.6949493288993835, + "step": 4708 + }, + { + "epoch": 1.9873417721518987, + "grad_norm": 1.0220824480056763, + "learning_rate": 8.271415367005762e-05, + "loss": 0.6535884737968445, + "step": 4710 + }, + { + "epoch": 1.9881856540084388, + "grad_norm": 1.3023611307144165, + "learning_rate": 8.269598884713339e-05, + "loss": 0.6635278463363647, + "step": 4712 + }, + { + "epoch": 1.989029535864979, + "grad_norm": 1.2526965141296387, + "learning_rate": 8.267781648187248e-05, + "loss": 0.7194697856903076, + "step": 4714 + }, + { + "epoch": 1.989873417721519, + "grad_norm": 1.0388038158416748, + "learning_rate": 8.265963657846691e-05, + "loss": 0.6355333924293518, + "step": 4716 + }, + { + "epoch": 1.990717299578059, + "grad_norm": 1.0852965116500854, + "learning_rate": 8.264144914111041e-05, + "loss": 0.6898305416107178, + "step": 4718 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 1.0714049339294434, + "learning_rate": 8.262325417399847e-05, + "loss": 0.6202836036682129, + "step": 4720 + }, + { + "epoch": 1.9924050632911392, + "grad_norm": 1.0767238140106201, + "learning_rate": 8.260505168132835e-05, + "loss": 0.6160458326339722, + "step": 4722 + }, + { + "epoch": 1.9932489451476794, + "grad_norm": 0.9605211615562439, + "learning_rate": 8.258684166729899e-05, + "loss": 0.6049920916557312, + "step": 4724 + }, + { + "epoch": 1.9940928270042195, + "grad_norm": 1.0580185651779175, + "learning_rate": 8.256862413611113e-05, + "loss": 0.5622014999389648, + "step": 4726 + }, + { + "epoch": 1.9949367088607595, + "grad_norm": 1.1039034128189087, + "learning_rate": 8.255039909196713e-05, + "loss": 0.6678924560546875, + "step": 4728 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 1.1482586860656738, + "learning_rate": 8.253216653907123e-05, + "loss": 0.658260703086853, + "step": 4730 + }, + { + "epoch": 1.9966244725738398, + "grad_norm": 1.135349988937378, + "learning_rate": 8.251392648162929e-05, + "loss": 0.6461613178253174, + "step": 4732 + }, + { + "epoch": 1.9974683544303797, + "grad_norm": 1.0155420303344727, + "learning_rate": 8.249567892384895e-05, + "loss": 0.6837426424026489, + "step": 4734 + }, + { + "epoch": 1.9983122362869197, + "grad_norm": 1.3392970561981201, + "learning_rate": 8.247742386993958e-05, + "loss": 0.6091697812080383, + "step": 4736 + }, + { + "epoch": 1.99915611814346, + "grad_norm": 1.0509974956512451, + "learning_rate": 8.245916132411226e-05, + "loss": 0.6539653539657593, + "step": 4738 + }, + { + "epoch": 2.0, + "grad_norm": 0.9777396321296692, + "learning_rate": 8.244089129057982e-05, + "loss": 0.5630147457122803, + "step": 4740 + }, + { + "epoch": 2.00084388185654, + "grad_norm": 1.1639164686203003, + "learning_rate": 8.24226137735568e-05, + "loss": 0.6190353631973267, + "step": 4742 + }, + { + "epoch": 2.0016877637130803, + "grad_norm": 1.119614839553833, + "learning_rate": 8.240432877725947e-05, + "loss": 0.6282529234886169, + "step": 4744 + }, + { + "epoch": 2.0025316455696203, + "grad_norm": 1.114739179611206, + "learning_rate": 8.238603630590581e-05, + "loss": 0.6176725625991821, + "step": 4746 + }, + { + "epoch": 2.00337552742616, + "grad_norm": 1.0543076992034912, + "learning_rate": 8.236773636371557e-05, + "loss": 0.5182007551193237, + "step": 4748 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 1.060389518737793, + "learning_rate": 8.234942895491019e-05, + "loss": 0.532536506652832, + "step": 4750 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 1.0824412107467651, + "learning_rate": 8.233111408371282e-05, + "loss": 0.5474061369895935, + "step": 4752 + }, + { + "epoch": 2.0059071729957805, + "grad_norm": 1.1450858116149902, + "learning_rate": 8.231279175434838e-05, + "loss": 0.586384654045105, + "step": 4754 + }, + { + "epoch": 2.006751054852321, + "grad_norm": 1.1225577592849731, + "learning_rate": 8.229446197104345e-05, + "loss": 0.6469444036483765, + "step": 4756 + }, + { + "epoch": 2.007594936708861, + "grad_norm": 1.7292449474334717, + "learning_rate": 8.227612473802637e-05, + "loss": 0.5371572971343994, + "step": 4758 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 1.1743781566619873, + "learning_rate": 8.22577800595272e-05, + "loss": 0.558707058429718, + "step": 4760 + }, + { + "epoch": 2.009282700421941, + "grad_norm": 1.0385273694992065, + "learning_rate": 8.223942793977769e-05, + "loss": 0.5943514108657837, + "step": 4762 + }, + { + "epoch": 2.010126582278481, + "grad_norm": 1.1302000284194946, + "learning_rate": 8.222106838301131e-05, + "loss": 0.5630753636360168, + "step": 4764 + }, + { + "epoch": 2.010970464135021, + "grad_norm": 1.140005111694336, + "learning_rate": 8.220270139346327e-05, + "loss": 0.527510404586792, + "step": 4766 + }, + { + "epoch": 2.0118143459915614, + "grad_norm": 1.1979734897613525, + "learning_rate": 8.21843269753705e-05, + "loss": 0.6315013766288757, + "step": 4768 + }, + { + "epoch": 2.0126582278481013, + "grad_norm": 1.3759459257125854, + "learning_rate": 8.21659451329716e-05, + "loss": 0.6225199699401855, + "step": 4770 + }, + { + "epoch": 2.0135021097046413, + "grad_norm": 1.330600380897522, + "learning_rate": 8.21475558705069e-05, + "loss": 0.6838938593864441, + "step": 4772 + }, + { + "epoch": 2.014345991561181, + "grad_norm": 1.2365351915359497, + "learning_rate": 8.21291591922185e-05, + "loss": 0.606302797794342, + "step": 4774 + }, + { + "epoch": 2.0151898734177216, + "grad_norm": 1.1886142492294312, + "learning_rate": 8.211075510235011e-05, + "loss": 0.6194182634353638, + "step": 4776 + }, + { + "epoch": 2.0160337552742615, + "grad_norm": 1.1414743661880493, + "learning_rate": 8.209234360514721e-05, + "loss": 0.639540433883667, + "step": 4778 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 1.2877455949783325, + "learning_rate": 8.2073924704857e-05, + "loss": 0.6350902318954468, + "step": 4780 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 1.095578908920288, + "learning_rate": 8.205549840572834e-05, + "loss": 0.5152000784873962, + "step": 4782 + }, + { + "epoch": 2.018565400843882, + "grad_norm": 1.0043798685073853, + "learning_rate": 8.203706471201183e-05, + "loss": 0.46245837211608887, + "step": 4784 + }, + { + "epoch": 2.0194092827004217, + "grad_norm": 1.2133857011795044, + "learning_rate": 8.201862362795979e-05, + "loss": 0.6471722722053528, + "step": 4786 + }, + { + "epoch": 2.020253164556962, + "grad_norm": 1.0835390090942383, + "learning_rate": 8.200017515782619e-05, + "loss": 0.5790625214576721, + "step": 4788 + }, + { + "epoch": 2.021097046413502, + "grad_norm": 1.0176091194152832, + "learning_rate": 8.198171930586678e-05, + "loss": 0.5826238989830017, + "step": 4790 + }, + { + "epoch": 2.021940928270042, + "grad_norm": 1.1581370830535889, + "learning_rate": 8.196325607633893e-05, + "loss": 0.5781272649765015, + "step": 4792 + }, + { + "epoch": 2.0227848101265824, + "grad_norm": 1.243381142616272, + "learning_rate": 8.194478547350178e-05, + "loss": 0.6600401997566223, + "step": 4794 + }, + { + "epoch": 2.0236286919831223, + "grad_norm": 1.0718560218811035, + "learning_rate": 8.192630750161612e-05, + "loss": 0.5291268825531006, + "step": 4796 + }, + { + "epoch": 2.0244725738396623, + "grad_norm": 1.2338320016860962, + "learning_rate": 8.190782216494448e-05, + "loss": 0.6564924120903015, + "step": 4798 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.978547990322113, + "learning_rate": 8.188932946775107e-05, + "loss": 0.5471183657646179, + "step": 4800 + }, + { + "epoch": 2.0253164556962027, + "eval_loss": 0.6924457550048828, + "eval_runtime": 514.0427, + "eval_samples_per_second": 4.099, + "eval_steps_per_second": 4.099, + "step": 4800 + }, + { + "epoch": 2.0261603375527426, + "grad_norm": 1.1782792806625366, + "learning_rate": 8.18708294143018e-05, + "loss": 0.567442774772644, + "step": 4802 + }, + { + "epoch": 2.0270042194092825, + "grad_norm": 1.0768574476242065, + "learning_rate": 8.185232200886426e-05, + "loss": 0.6005180478096008, + "step": 4804 + }, + { + "epoch": 2.027848101265823, + "grad_norm": 1.3096717596054077, + "learning_rate": 8.18338072557078e-05, + "loss": 0.616436779499054, + "step": 4806 + }, + { + "epoch": 2.028691983122363, + "grad_norm": 1.0233508348464966, + "learning_rate": 8.181528515910336e-05, + "loss": 0.49587416648864746, + "step": 4808 + }, + { + "epoch": 2.029535864978903, + "grad_norm": 1.0800065994262695, + "learning_rate": 8.179675572332366e-05, + "loss": 0.5758571624755859, + "step": 4810 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 1.09299898147583, + "learning_rate": 8.177821895264309e-05, + "loss": 0.561736524105072, + "step": 4812 + }, + { + "epoch": 2.031223628691983, + "grad_norm": 1.1439210176467896, + "learning_rate": 8.175967485133771e-05, + "loss": 0.5249468088150024, + "step": 4814 + }, + { + "epoch": 2.032067510548523, + "grad_norm": 1.15841805934906, + "learning_rate": 8.174112342368532e-05, + "loss": 0.6429001688957214, + "step": 4816 + }, + { + "epoch": 2.0329113924050635, + "grad_norm": 1.1720670461654663, + "learning_rate": 8.172256467396533e-05, + "loss": 0.60152667760849, + "step": 4818 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 1.2652091979980469, + "learning_rate": 8.170399860645892e-05, + "loss": 0.5553541779518127, + "step": 4820 + }, + { + "epoch": 2.0345991561181433, + "grad_norm": 1.0768507719039917, + "learning_rate": 8.168542522544893e-05, + "loss": 0.5369323492050171, + "step": 4822 + }, + { + "epoch": 2.0354430379746837, + "grad_norm": 0.9906469583511353, + "learning_rate": 8.166684453521986e-05, + "loss": 0.5468952655792236, + "step": 4824 + }, + { + "epoch": 2.0362869198312237, + "grad_norm": 1.3448988199234009, + "learning_rate": 8.164825654005792e-05, + "loss": 0.5795659422874451, + "step": 4826 + }, + { + "epoch": 2.0371308016877636, + "grad_norm": 1.2502341270446777, + "learning_rate": 8.162966124425103e-05, + "loss": 0.6465779542922974, + "step": 4828 + }, + { + "epoch": 2.037974683544304, + "grad_norm": 1.1512303352355957, + "learning_rate": 8.161105865208875e-05, + "loss": 0.5509394407272339, + "step": 4830 + }, + { + "epoch": 2.038818565400844, + "grad_norm": 1.2513408660888672, + "learning_rate": 8.159244876786232e-05, + "loss": 0.5515735745429993, + "step": 4832 + }, + { + "epoch": 2.039662447257384, + "grad_norm": 1.3035682439804077, + "learning_rate": 8.157383159586473e-05, + "loss": 0.757799506187439, + "step": 4834 + }, + { + "epoch": 2.0405063291139243, + "grad_norm": 1.1136540174484253, + "learning_rate": 8.155520714039056e-05, + "loss": 0.607295036315918, + "step": 4836 + }, + { + "epoch": 2.041350210970464, + "grad_norm": 1.220146656036377, + "learning_rate": 8.153657540573613e-05, + "loss": 0.5769712328910828, + "step": 4838 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 1.2104195356369019, + "learning_rate": 8.151793639619944e-05, + "loss": 0.5746933817863464, + "step": 4840 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 1.241708517074585, + "learning_rate": 8.149929011608014e-05, + "loss": 0.5932332277297974, + "step": 4842 + }, + { + "epoch": 2.0438818565400845, + "grad_norm": 1.1172713041305542, + "learning_rate": 8.148063656967955e-05, + "loss": 0.583284318447113, + "step": 4844 + }, + { + "epoch": 2.0447257383966244, + "grad_norm": 1.0867618322372437, + "learning_rate": 8.14619757613007e-05, + "loss": 0.5589476823806763, + "step": 4846 + }, + { + "epoch": 2.0455696202531644, + "grad_norm": 1.2470483779907227, + "learning_rate": 8.14433076952483e-05, + "loss": 0.6118156313896179, + "step": 4848 + }, + { + "epoch": 2.0464135021097047, + "grad_norm": 1.0908832550048828, + "learning_rate": 8.142463237582868e-05, + "loss": 0.5815895795822144, + "step": 4850 + }, + { + "epoch": 2.0472573839662447, + "grad_norm": 1.2589281797409058, + "learning_rate": 8.140594980734989e-05, + "loss": 0.6232373714447021, + "step": 4852 + }, + { + "epoch": 2.0481012658227846, + "grad_norm": 1.234152913093567, + "learning_rate": 8.138725999412165e-05, + "loss": 0.5992053151130676, + "step": 4854 + }, + { + "epoch": 2.048945147679325, + "grad_norm": 1.3304446935653687, + "learning_rate": 8.136856294045533e-05, + "loss": 0.6494496464729309, + "step": 4856 + }, + { + "epoch": 2.049789029535865, + "grad_norm": 1.1871088743209839, + "learning_rate": 8.134985865066398e-05, + "loss": 0.6263431906700134, + "step": 4858 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 1.1454699039459229, + "learning_rate": 8.133114712906234e-05, + "loss": 0.6036502122879028, + "step": 4860 + }, + { + "epoch": 2.0514767932489453, + "grad_norm": 1.2953420877456665, + "learning_rate": 8.131242837996675e-05, + "loss": 0.5674451589584351, + "step": 4862 + }, + { + "epoch": 2.052320675105485, + "grad_norm": 1.1874405145645142, + "learning_rate": 8.129370240769534e-05, + "loss": 0.5616317987442017, + "step": 4864 + }, + { + "epoch": 2.053164556962025, + "grad_norm": 1.2936227321624756, + "learning_rate": 8.127496921656777e-05, + "loss": 0.6495023369789124, + "step": 4866 + }, + { + "epoch": 2.0540084388185655, + "grad_norm": 1.1935228109359741, + "learning_rate": 8.125622881090544e-05, + "loss": 0.6028099060058594, + "step": 4868 + }, + { + "epoch": 2.0548523206751055, + "grad_norm": 0.9932331442832947, + "learning_rate": 8.123748119503143e-05, + "loss": 0.476296067237854, + "step": 4870 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 1.3878839015960693, + "learning_rate": 8.121872637327042e-05, + "loss": 0.6191902756690979, + "step": 4872 + }, + { + "epoch": 2.056540084388186, + "grad_norm": 1.1185581684112549, + "learning_rate": 8.11999643499488e-05, + "loss": 0.566487729549408, + "step": 4874 + }, + { + "epoch": 2.0573839662447257, + "grad_norm": 1.3729257583618164, + "learning_rate": 8.118119512939464e-05, + "loss": 0.5970078706741333, + "step": 4876 + }, + { + "epoch": 2.0582278481012657, + "grad_norm": 1.1332688331604004, + "learning_rate": 8.11624187159376e-05, + "loss": 0.570341944694519, + "step": 4878 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 1.2648937702178955, + "learning_rate": 8.114363511390903e-05, + "loss": 0.6302897334098816, + "step": 4880 + }, + { + "epoch": 2.059915611814346, + "grad_norm": 1.250616192817688, + "learning_rate": 8.112484432764197e-05, + "loss": 0.5619142651557922, + "step": 4882 + }, + { + "epoch": 2.060759493670886, + "grad_norm": 0.9710861444473267, + "learning_rate": 8.110604636147109e-05, + "loss": 0.5426228642463684, + "step": 4884 + }, + { + "epoch": 2.0616033755274263, + "grad_norm": 1.1979506015777588, + "learning_rate": 8.108724121973271e-05, + "loss": 0.5498107671737671, + "step": 4886 + }, + { + "epoch": 2.0624472573839663, + "grad_norm": 1.0936485528945923, + "learning_rate": 8.106842890676483e-05, + "loss": 0.5695134401321411, + "step": 4888 + }, + { + "epoch": 2.0632911392405062, + "grad_norm": 1.1246092319488525, + "learning_rate": 8.10496094269071e-05, + "loss": 0.5998331308364868, + "step": 4890 + }, + { + "epoch": 2.0641350210970466, + "grad_norm": 1.244438648223877, + "learning_rate": 8.103078278450075e-05, + "loss": 0.5702623128890991, + "step": 4892 + }, + { + "epoch": 2.0649789029535865, + "grad_norm": 1.1585633754730225, + "learning_rate": 8.101194898388881e-05, + "loss": 0.5392299890518188, + "step": 4894 + }, + { + "epoch": 2.0658227848101265, + "grad_norm": 1.3044285774230957, + "learning_rate": 8.099310802941582e-05, + "loss": 0.5640127658843994, + "step": 4896 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.2483032941818237, + "learning_rate": 8.097425992542804e-05, + "loss": 0.6103175282478333, + "step": 4898 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 1.0845462083816528, + "learning_rate": 8.095540467627337e-05, + "loss": 0.5041166543960571, + "step": 4900 + }, + { + "epoch": 2.067510548523207, + "eval_loss": 0.6941288113594055, + "eval_runtime": 513.4497, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 4900 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 1.2493232488632202, + "learning_rate": 8.093654228630134e-05, + "loss": 0.6253946423530579, + "step": 4902 + }, + { + "epoch": 2.0691983122362867, + "grad_norm": 1.1668756008148193, + "learning_rate": 8.091767275986317e-05, + "loss": 0.523486852645874, + "step": 4904 + }, + { + "epoch": 2.070042194092827, + "grad_norm": 1.1709638833999634, + "learning_rate": 8.089879610131167e-05, + "loss": 0.5569989681243896, + "step": 4906 + }, + { + "epoch": 2.070886075949367, + "grad_norm": 1.1044740676879883, + "learning_rate": 8.087991231500133e-05, + "loss": 0.642728865146637, + "step": 4908 + }, + { + "epoch": 2.071729957805907, + "grad_norm": 1.1032549142837524, + "learning_rate": 8.086102140528828e-05, + "loss": 0.5998259782791138, + "step": 4910 + }, + { + "epoch": 2.0725738396624473, + "grad_norm": 0.9980027079582214, + "learning_rate": 8.08421233765303e-05, + "loss": 0.5460172891616821, + "step": 4912 + }, + { + "epoch": 2.0734177215189873, + "grad_norm": 1.0866090059280396, + "learning_rate": 8.082321823308679e-05, + "loss": 0.5643284916877747, + "step": 4914 + }, + { + "epoch": 2.0742616033755272, + "grad_norm": 1.1942687034606934, + "learning_rate": 8.080430597931878e-05, + "loss": 0.554400622844696, + "step": 4916 + }, + { + "epoch": 2.0751054852320676, + "grad_norm": 1.0680599212646484, + "learning_rate": 8.078538661958901e-05, + "loss": 0.5955621004104614, + "step": 4918 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 1.20845627784729, + "learning_rate": 8.076646015826179e-05, + "loss": 0.5970203280448914, + "step": 4920 + }, + { + "epoch": 2.0767932489451475, + "grad_norm": 1.8368924856185913, + "learning_rate": 8.074752659970308e-05, + "loss": 0.6467664837837219, + "step": 4922 + }, + { + "epoch": 2.077637130801688, + "grad_norm": 1.3291922807693481, + "learning_rate": 8.072858594828053e-05, + "loss": 0.630719006061554, + "step": 4924 + }, + { + "epoch": 2.078481012658228, + "grad_norm": 1.1496083736419678, + "learning_rate": 8.070963820836333e-05, + "loss": 0.601140022277832, + "step": 4926 + }, + { + "epoch": 2.0793248945147678, + "grad_norm": 1.1562724113464355, + "learning_rate": 8.069068338432239e-05, + "loss": 0.6096881031990051, + "step": 4928 + }, + { + "epoch": 2.080168776371308, + "grad_norm": 1.0115300416946411, + "learning_rate": 8.067172148053021e-05, + "loss": 0.5085908770561218, + "step": 4930 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 1.2181830406188965, + "learning_rate": 8.065275250136097e-05, + "loss": 0.5268720984458923, + "step": 4932 + }, + { + "epoch": 2.081856540084388, + "grad_norm": 1.1249788999557495, + "learning_rate": 8.06337764511904e-05, + "loss": 0.6075665950775146, + "step": 4934 + }, + { + "epoch": 2.0827004219409284, + "grad_norm": 1.1143964529037476, + "learning_rate": 8.061479333439595e-05, + "loss": 0.59170001745224, + "step": 4936 + }, + { + "epoch": 2.0835443037974684, + "grad_norm": 1.4773131608963013, + "learning_rate": 8.059580315535664e-05, + "loss": 0.6689745187759399, + "step": 4938 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 1.143965244293213, + "learning_rate": 8.057680591845316e-05, + "loss": 0.5409777760505676, + "step": 4940 + }, + { + "epoch": 2.0852320675105487, + "grad_norm": 1.0384942293167114, + "learning_rate": 8.055780162806777e-05, + "loss": 0.5778636336326599, + "step": 4942 + }, + { + "epoch": 2.0860759493670886, + "grad_norm": 1.0102177858352661, + "learning_rate": 8.053879028858442e-05, + "loss": 0.5576038360595703, + "step": 4944 + }, + { + "epoch": 2.0869198312236286, + "grad_norm": 1.3792158365249634, + "learning_rate": 8.051977190438868e-05, + "loss": 0.5873376131057739, + "step": 4946 + }, + { + "epoch": 2.087763713080169, + "grad_norm": 1.4402949810028076, + "learning_rate": 8.050074647986768e-05, + "loss": 0.6067743301391602, + "step": 4948 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.2719058990478516, + "learning_rate": 8.048171401941027e-05, + "loss": 0.604671835899353, + "step": 4950 + }, + { + "epoch": 2.089451476793249, + "grad_norm": 1.1054867506027222, + "learning_rate": 8.046267452740683e-05, + "loss": 0.5743544697761536, + "step": 4952 + }, + { + "epoch": 2.090295358649789, + "grad_norm": 1.0521535873413086, + "learning_rate": 8.044362800824944e-05, + "loss": 0.576278567314148, + "step": 4954 + }, + { + "epoch": 2.091139240506329, + "grad_norm": 1.2665088176727295, + "learning_rate": 8.042457446633174e-05, + "loss": 0.5903641581535339, + "step": 4956 + }, + { + "epoch": 2.091983122362869, + "grad_norm": 1.1283398866653442, + "learning_rate": 8.040551390604902e-05, + "loss": 0.5854214429855347, + "step": 4958 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 1.1194316148757935, + "learning_rate": 8.03864463317982e-05, + "loss": 0.5843619108200073, + "step": 4960 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 1.3581651449203491, + "learning_rate": 8.036737174797778e-05, + "loss": 0.6115096211433411, + "step": 4962 + }, + { + "epoch": 2.0945147679324894, + "grad_norm": 1.341748595237732, + "learning_rate": 8.034829015898793e-05, + "loss": 0.5998795032501221, + "step": 4964 + }, + { + "epoch": 2.0953586497890297, + "grad_norm": 1.2212611436843872, + "learning_rate": 8.032920156923038e-05, + "loss": 0.628372311592102, + "step": 4966 + }, + { + "epoch": 2.0962025316455697, + "grad_norm": 1.1348317861557007, + "learning_rate": 8.031010598310851e-05, + "loss": 0.5668916702270508, + "step": 4968 + }, + { + "epoch": 2.0970464135021096, + "grad_norm": 1.1106547117233276, + "learning_rate": 8.029100340502731e-05, + "loss": 0.5253881216049194, + "step": 4970 + }, + { + "epoch": 2.09789029535865, + "grad_norm": 1.2471354007720947, + "learning_rate": 8.027189383939339e-05, + "loss": 0.5790762901306152, + "step": 4972 + }, + { + "epoch": 2.09873417721519, + "grad_norm": 1.2477394342422485, + "learning_rate": 8.025277729061492e-05, + "loss": 0.6382888555526733, + "step": 4974 + }, + { + "epoch": 2.09957805907173, + "grad_norm": 1.2716054916381836, + "learning_rate": 8.023365376310176e-05, + "loss": 0.5962072610855103, + "step": 4976 + }, + { + "epoch": 2.10042194092827, + "grad_norm": 1.257820725440979, + "learning_rate": 8.021452326126532e-05, + "loss": 0.5882940292358398, + "step": 4978 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 1.0924186706542969, + "learning_rate": 8.019538578951864e-05, + "loss": 0.5640701055526733, + "step": 4980 + }, + { + "epoch": 2.10210970464135, + "grad_norm": 1.1250383853912354, + "learning_rate": 8.017624135227637e-05, + "loss": 0.5746428966522217, + "step": 4982 + }, + { + "epoch": 2.10295358649789, + "grad_norm": 1.131323218345642, + "learning_rate": 8.015708995395477e-05, + "loss": 0.5611346960067749, + "step": 4984 + }, + { + "epoch": 2.1037974683544305, + "grad_norm": 1.4267152547836304, + "learning_rate": 8.013793159897171e-05, + "loss": 0.6173797249794006, + "step": 4986 + }, + { + "epoch": 2.1046413502109704, + "grad_norm": 1.41414213180542, + "learning_rate": 8.011876629174662e-05, + "loss": 0.64865642786026, + "step": 4988 + }, + { + "epoch": 2.1054852320675104, + "grad_norm": 1.1498184204101562, + "learning_rate": 8.00995940367006e-05, + "loss": 0.6125827431678772, + "step": 4990 + }, + { + "epoch": 2.1063291139240508, + "grad_norm": 1.2327708005905151, + "learning_rate": 8.00804148382563e-05, + "loss": 0.670495867729187, + "step": 4992 + }, + { + "epoch": 2.1071729957805907, + "grad_norm": 1.2797311544418335, + "learning_rate": 8.0061228700838e-05, + "loss": 0.6020209193229675, + "step": 4994 + }, + { + "epoch": 2.1080168776371306, + "grad_norm": 1.079584002494812, + "learning_rate": 8.004203562887157e-05, + "loss": 0.5974310636520386, + "step": 4996 + }, + { + "epoch": 2.108860759493671, + "grad_norm": 1.4352604150772095, + "learning_rate": 8.002283562678452e-05, + "loss": 0.6424587368965149, + "step": 4998 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 1.0876719951629639, + "learning_rate": 8.000362869900586e-05, + "loss": 0.6185846328735352, + "step": 5000 + }, + { + "epoch": 2.109704641350211, + "eval_loss": 0.6908889412879944, + "eval_runtime": 675.8398, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "step": 5000 + }, + { + "epoch": 2.110548523206751, + "grad_norm": 1.0125762224197388, + "learning_rate": 7.998441484996631e-05, + "loss": 0.6127280592918396, + "step": 5002 + }, + { + "epoch": 2.1113924050632913, + "grad_norm": 1.0253753662109375, + "learning_rate": 7.99651940840981e-05, + "loss": 0.5495694875717163, + "step": 5004 + }, + { + "epoch": 2.1122362869198312, + "grad_norm": 1.5620673894882202, + "learning_rate": 7.994596640583511e-05, + "loss": 0.6199497580528259, + "step": 5006 + }, + { + "epoch": 2.113080168776371, + "grad_norm": 1.3032969236373901, + "learning_rate": 7.992673181961281e-05, + "loss": 0.5896390676498413, + "step": 5008 + }, + { + "epoch": 2.1139240506329116, + "grad_norm": 1.0933046340942383, + "learning_rate": 7.990749032986821e-05, + "loss": 0.6332341432571411, + "step": 5010 + }, + { + "epoch": 2.1147679324894515, + "grad_norm": 1.3115314245224, + "learning_rate": 7.988824194104e-05, + "loss": 0.5964323282241821, + "step": 5012 + }, + { + "epoch": 2.1156118143459914, + "grad_norm": 1.229978084564209, + "learning_rate": 7.986898665756837e-05, + "loss": 0.5938325524330139, + "step": 5014 + }, + { + "epoch": 2.116455696202532, + "grad_norm": 1.1779940128326416, + "learning_rate": 7.984972448389517e-05, + "loss": 0.5761791467666626, + "step": 5016 + }, + { + "epoch": 2.1172995780590718, + "grad_norm": 1.063490629196167, + "learning_rate": 7.98304554244638e-05, + "loss": 0.6073653101921082, + "step": 5018 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 1.2390391826629639, + "learning_rate": 7.981117948371927e-05, + "loss": 0.6126761436462402, + "step": 5020 + }, + { + "epoch": 2.118987341772152, + "grad_norm": 1.1946247816085815, + "learning_rate": 7.979189666610818e-05, + "loss": 0.614434003829956, + "step": 5022 + }, + { + "epoch": 2.119831223628692, + "grad_norm": 1.1008374691009521, + "learning_rate": 7.977260697607867e-05, + "loss": 0.5947603583335876, + "step": 5024 + }, + { + "epoch": 2.120675105485232, + "grad_norm": 1.14899480342865, + "learning_rate": 7.975331041808054e-05, + "loss": 0.583965539932251, + "step": 5026 + }, + { + "epoch": 2.1215189873417724, + "grad_norm": 1.1627864837646484, + "learning_rate": 7.973400699656512e-05, + "loss": 0.615121603012085, + "step": 5028 + }, + { + "epoch": 2.1223628691983123, + "grad_norm": 1.3622617721557617, + "learning_rate": 7.971469671598532e-05, + "loss": 0.6268601417541504, + "step": 5030 + }, + { + "epoch": 2.1232067510548522, + "grad_norm": 1.1735879182815552, + "learning_rate": 7.96953795807957e-05, + "loss": 0.6021270155906677, + "step": 5032 + }, + { + "epoch": 2.124050632911392, + "grad_norm": 1.3856201171875, + "learning_rate": 7.96760555954523e-05, + "loss": 0.636816680431366, + "step": 5034 + }, + { + "epoch": 2.1248945147679326, + "grad_norm": 1.1410126686096191, + "learning_rate": 7.965672476441282e-05, + "loss": 0.5324423313140869, + "step": 5036 + }, + { + "epoch": 2.1257383966244725, + "grad_norm": 1.446070909500122, + "learning_rate": 7.963738709213651e-05, + "loss": 0.7433624267578125, + "step": 5038 + }, + { + "epoch": 2.1265822784810124, + "grad_norm": 1.3041753768920898, + "learning_rate": 7.961804258308419e-05, + "loss": 0.6359145641326904, + "step": 5040 + }, + { + "epoch": 2.127426160337553, + "grad_norm": 1.2043813467025757, + "learning_rate": 7.959869124171826e-05, + "loss": 0.6164234280586243, + "step": 5042 + }, + { + "epoch": 2.1282700421940928, + "grad_norm": 1.2375630140304565, + "learning_rate": 7.957933307250273e-05, + "loss": 0.6437279582023621, + "step": 5044 + }, + { + "epoch": 2.1291139240506327, + "grad_norm": 1.210644245147705, + "learning_rate": 7.955996807990314e-05, + "loss": 0.585924506187439, + "step": 5046 + }, + { + "epoch": 2.129957805907173, + "grad_norm": 1.2011489868164062, + "learning_rate": 7.954059626838661e-05, + "loss": 0.6081803441047668, + "step": 5048 + }, + { + "epoch": 2.130801687763713, + "grad_norm": 1.0365782976150513, + "learning_rate": 7.952121764242187e-05, + "loss": 0.5609047412872314, + "step": 5050 + }, + { + "epoch": 2.131645569620253, + "grad_norm": 1.7950767278671265, + "learning_rate": 7.950183220647918e-05, + "loss": 0.5612874031066895, + "step": 5052 + }, + { + "epoch": 2.1324894514767934, + "grad_norm": 1.2933409214019775, + "learning_rate": 7.94824399650304e-05, + "loss": 0.6554630994796753, + "step": 5054 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 1.129828929901123, + "learning_rate": 7.946304092254894e-05, + "loss": 0.5623239278793335, + "step": 5056 + }, + { + "epoch": 2.1341772151898732, + "grad_norm": 1.1060296297073364, + "learning_rate": 7.944363508350978e-05, + "loss": 0.5036910772323608, + "step": 5058 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 1.2622627019882202, + "learning_rate": 7.94242224523895e-05, + "loss": 0.5840913653373718, + "step": 5060 + }, + { + "epoch": 2.1358649789029536, + "grad_norm": 1.3803153038024902, + "learning_rate": 7.940480303366618e-05, + "loss": 0.6365578770637512, + "step": 5062 + }, + { + "epoch": 2.1367088607594935, + "grad_norm": 1.2524651288986206, + "learning_rate": 7.938537683181955e-05, + "loss": 0.6167916655540466, + "step": 5064 + }, + { + "epoch": 2.137552742616034, + "grad_norm": 1.3320350646972656, + "learning_rate": 7.936594385133083e-05, + "loss": 0.6356930732727051, + "step": 5066 + }, + { + "epoch": 2.138396624472574, + "grad_norm": 1.3180949687957764, + "learning_rate": 7.934650409668285e-05, + "loss": 0.5888242721557617, + "step": 5068 + }, + { + "epoch": 2.1392405063291138, + "grad_norm": 1.1376243829727173, + "learning_rate": 7.932705757235999e-05, + "loss": 0.608725905418396, + "step": 5070 + }, + { + "epoch": 2.140084388185654, + "grad_norm": 1.1734369993209839, + "learning_rate": 7.930760428284817e-05, + "loss": 0.5824158787727356, + "step": 5072 + }, + { + "epoch": 2.140928270042194, + "grad_norm": 1.1038579940795898, + "learning_rate": 7.928814423263493e-05, + "loss": 0.5629416704177856, + "step": 5074 + }, + { + "epoch": 2.141772151898734, + "grad_norm": 1.269780158996582, + "learning_rate": 7.926867742620929e-05, + "loss": 0.5994445085525513, + "step": 5076 + }, + { + "epoch": 2.1426160337552744, + "grad_norm": 1.2274279594421387, + "learning_rate": 7.924920386806188e-05, + "loss": 0.5845475792884827, + "step": 5078 + }, + { + "epoch": 2.1434599156118144, + "grad_norm": 1.168766975402832, + "learning_rate": 7.922972356268488e-05, + "loss": 0.621201753616333, + "step": 5080 + }, + { + "epoch": 2.1443037974683543, + "grad_norm": 1.0057638883590698, + "learning_rate": 7.921023651457203e-05, + "loss": 0.5282597541809082, + "step": 5082 + }, + { + "epoch": 2.1451476793248947, + "grad_norm": 1.432309865951538, + "learning_rate": 7.91907427282186e-05, + "loss": 0.632583737373352, + "step": 5084 + }, + { + "epoch": 2.1459915611814346, + "grad_norm": 1.3939776420593262, + "learning_rate": 7.917124220812144e-05, + "loss": 0.6239289045333862, + "step": 5086 + }, + { + "epoch": 2.1468354430379746, + "grad_norm": 1.3741775751113892, + "learning_rate": 7.915173495877895e-05, + "loss": 0.5749062895774841, + "step": 5088 + }, + { + "epoch": 2.147679324894515, + "grad_norm": 1.3123528957366943, + "learning_rate": 7.913222098469109e-05, + "loss": 0.6011738181114197, + "step": 5090 + }, + { + "epoch": 2.148523206751055, + "grad_norm": 1.3473498821258545, + "learning_rate": 7.911270029035932e-05, + "loss": 0.5804699659347534, + "step": 5092 + }, + { + "epoch": 2.149367088607595, + "grad_norm": 1.0873067378997803, + "learning_rate": 7.909317288028673e-05, + "loss": 0.6446103453636169, + "step": 5094 + }, + { + "epoch": 2.1502109704641352, + "grad_norm": 1.1374083757400513, + "learning_rate": 7.907363875897789e-05, + "loss": 0.6136524677276611, + "step": 5096 + }, + { + "epoch": 2.151054852320675, + "grad_norm": 1.1356533765792847, + "learning_rate": 7.905409793093896e-05, + "loss": 0.5107976794242859, + "step": 5098 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 1.2579567432403564, + "learning_rate": 7.903455040067763e-05, + "loss": 0.6073099374771118, + "step": 5100 + }, + { + "epoch": 2.151898734177215, + "eval_loss": 0.6902023553848267, + "eval_runtime": 733.915, + "eval_samples_per_second": 2.871, + "eval_steps_per_second": 2.871, + "step": 5100 + }, + { + "epoch": 2.1527426160337555, + "grad_norm": 1.2401398420333862, + "learning_rate": 7.901499617270315e-05, + "loss": 0.5562406182289124, + "step": 5102 + }, + { + "epoch": 2.1535864978902954, + "grad_norm": 1.086590051651001, + "learning_rate": 7.899543525152628e-05, + "loss": 0.5749467015266418, + "step": 5104 + }, + { + "epoch": 2.1544303797468354, + "grad_norm": 1.206458568572998, + "learning_rate": 7.897586764165939e-05, + "loss": 0.6326877474784851, + "step": 5106 + }, + { + "epoch": 2.1552742616033758, + "grad_norm": 1.030740737915039, + "learning_rate": 7.895629334761632e-05, + "loss": 0.5616445541381836, + "step": 5108 + }, + { + "epoch": 2.1561181434599157, + "grad_norm": 1.3338581323623657, + "learning_rate": 7.89367123739125e-05, + "loss": 0.6307384371757507, + "step": 5110 + }, + { + "epoch": 2.1569620253164556, + "grad_norm": 1.2684671878814697, + "learning_rate": 7.891712472506485e-05, + "loss": 0.6087653636932373, + "step": 5112 + }, + { + "epoch": 2.1578059071729956, + "grad_norm": 1.1610581874847412, + "learning_rate": 7.889753040559188e-05, + "loss": 0.5747998952865601, + "step": 5114 + }, + { + "epoch": 2.158649789029536, + "grad_norm": 1.4069275856018066, + "learning_rate": 7.887792942001366e-05, + "loss": 0.6143770217895508, + "step": 5116 + }, + { + "epoch": 2.159493670886076, + "grad_norm": 1.0858227014541626, + "learning_rate": 7.885832177285173e-05, + "loss": 0.552534282207489, + "step": 5118 + }, + { + "epoch": 2.160337552742616, + "grad_norm": 1.067070722579956, + "learning_rate": 7.88387074686292e-05, + "loss": 0.5781989693641663, + "step": 5120 + }, + { + "epoch": 2.1611814345991562, + "grad_norm": 1.139981746673584, + "learning_rate": 7.881908651187072e-05, + "loss": 0.5521422624588013, + "step": 5122 + }, + { + "epoch": 2.162025316455696, + "grad_norm": 1.0987457036972046, + "learning_rate": 7.879945890710245e-05, + "loss": 0.5755025744438171, + "step": 5124 + }, + { + "epoch": 2.162869198312236, + "grad_norm": 1.1530758142471313, + "learning_rate": 7.877982465885214e-05, + "loss": 0.5783509612083435, + "step": 5126 + }, + { + "epoch": 2.1637130801687765, + "grad_norm": 1.2285696268081665, + "learning_rate": 7.876018377164899e-05, + "loss": 0.5942281484603882, + "step": 5128 + }, + { + "epoch": 2.1645569620253164, + "grad_norm": 1.1283711194992065, + "learning_rate": 7.874053625002378e-05, + "loss": 0.5539707541465759, + "step": 5130 + }, + { + "epoch": 2.1654008438818564, + "grad_norm": 1.3213335275650024, + "learning_rate": 7.872088209850885e-05, + "loss": 0.5955292582511902, + "step": 5132 + }, + { + "epoch": 2.1662447257383968, + "grad_norm": 1.1748592853546143, + "learning_rate": 7.8701221321638e-05, + "loss": 0.5422899723052979, + "step": 5134 + }, + { + "epoch": 2.1670886075949367, + "grad_norm": 1.0752148628234863, + "learning_rate": 7.868155392394662e-05, + "loss": 0.5547205209732056, + "step": 5136 + }, + { + "epoch": 2.1679324894514767, + "grad_norm": 1.1814554929733276, + "learning_rate": 7.86618799099716e-05, + "loss": 0.5938948392868042, + "step": 5138 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 1.3455278873443604, + "learning_rate": 7.864219928425132e-05, + "loss": 0.6468925476074219, + "step": 5140 + }, + { + "epoch": 2.169620253164557, + "grad_norm": 1.2695354223251343, + "learning_rate": 7.862251205132576e-05, + "loss": 0.5704391002655029, + "step": 5142 + }, + { + "epoch": 2.170464135021097, + "grad_norm": 1.1529468297958374, + "learning_rate": 7.860281821573638e-05, + "loss": 0.6057283878326416, + "step": 5144 + }, + { + "epoch": 2.1713080168776373, + "grad_norm": 1.3461004495620728, + "learning_rate": 7.858311778202616e-05, + "loss": 0.6135527491569519, + "step": 5146 + }, + { + "epoch": 2.1721518987341772, + "grad_norm": 1.1258536577224731, + "learning_rate": 7.856341075473962e-05, + "loss": 0.5585638880729675, + "step": 5148 + }, + { + "epoch": 2.172995780590717, + "grad_norm": 1.254898190498352, + "learning_rate": 7.854369713842279e-05, + "loss": 0.5780918002128601, + "step": 5150 + }, + { + "epoch": 2.1738396624472576, + "grad_norm": 1.2730201482772827, + "learning_rate": 7.852397693762321e-05, + "loss": 0.595267117023468, + "step": 5152 + }, + { + "epoch": 2.1746835443037975, + "grad_norm": 1.1875078678131104, + "learning_rate": 7.850425015688999e-05, + "loss": 0.5636162161827087, + "step": 5154 + }, + { + "epoch": 2.1755274261603375, + "grad_norm": 1.0930945873260498, + "learning_rate": 7.848451680077366e-05, + "loss": 0.6362089514732361, + "step": 5156 + }, + { + "epoch": 2.176371308016878, + "grad_norm": 1.2274452447891235, + "learning_rate": 7.846477687382639e-05, + "loss": 0.6268675327301025, + "step": 5158 + }, + { + "epoch": 2.1772151898734178, + "grad_norm": 1.2023133039474487, + "learning_rate": 7.844503038060176e-05, + "loss": 0.6014906167984009, + "step": 5160 + }, + { + "epoch": 2.1780590717299577, + "grad_norm": 1.2616889476776123, + "learning_rate": 7.842527732565491e-05, + "loss": 0.6180019974708557, + "step": 5162 + }, + { + "epoch": 2.1789029535864977, + "grad_norm": 1.1046907901763916, + "learning_rate": 7.84055177135425e-05, + "loss": 0.5400100946426392, + "step": 5164 + }, + { + "epoch": 2.179746835443038, + "grad_norm": 1.1664032936096191, + "learning_rate": 7.83857515488227e-05, + "loss": 0.5713199973106384, + "step": 5166 + }, + { + "epoch": 2.180590717299578, + "grad_norm": 1.2526558637619019, + "learning_rate": 7.836597883605519e-05, + "loss": 0.5741307735443115, + "step": 5168 + }, + { + "epoch": 2.181434599156118, + "grad_norm": 1.0457103252410889, + "learning_rate": 7.834619957980112e-05, + "loss": 0.47188031673431396, + "step": 5170 + }, + { + "epoch": 2.1822784810126583, + "grad_norm": 1.1978110074996948, + "learning_rate": 7.832641378462319e-05, + "loss": 0.6149471998214722, + "step": 5172 + }, + { + "epoch": 2.1831223628691983, + "grad_norm": 1.2231460809707642, + "learning_rate": 7.830662145508567e-05, + "loss": 0.5520018339157104, + "step": 5174 + }, + { + "epoch": 2.183966244725738, + "grad_norm": 1.4367618560791016, + "learning_rate": 7.828682259575417e-05, + "loss": 0.6536548733711243, + "step": 5176 + }, + { + "epoch": 2.1848101265822786, + "grad_norm": 1.0891374349594116, + "learning_rate": 7.826701721119598e-05, + "loss": 0.5324372053146362, + "step": 5178 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 1.118695616722107, + "learning_rate": 7.82472053059798e-05, + "loss": 0.6127952337265015, + "step": 5180 + }, + { + "epoch": 2.1864978902953585, + "grad_norm": 1.1116070747375488, + "learning_rate": 7.822738688467585e-05, + "loss": 0.505962610244751, + "step": 5182 + }, + { + "epoch": 2.187341772151899, + "grad_norm": 1.2140545845031738, + "learning_rate": 7.820756195185586e-05, + "loss": 0.6210073232650757, + "step": 5184 + }, + { + "epoch": 2.188185654008439, + "grad_norm": 1.2135601043701172, + "learning_rate": 7.818773051209307e-05, + "loss": 0.6517674326896667, + "step": 5186 + }, + { + "epoch": 2.1890295358649787, + "grad_norm": 1.3875514268875122, + "learning_rate": 7.816789256996218e-05, + "loss": 0.5577492117881775, + "step": 5188 + }, + { + "epoch": 2.189873417721519, + "grad_norm": 1.181325912475586, + "learning_rate": 7.814804813003949e-05, + "loss": 0.6010199189186096, + "step": 5190 + }, + { + "epoch": 2.190717299578059, + "grad_norm": 1.102044701576233, + "learning_rate": 7.812819719690265e-05, + "loss": 0.5635302662849426, + "step": 5192 + }, + { + "epoch": 2.191561181434599, + "grad_norm": 1.4227958917617798, + "learning_rate": 7.810833977513094e-05, + "loss": 0.5804321765899658, + "step": 5194 + }, + { + "epoch": 2.1924050632911394, + "grad_norm": 1.2573446035385132, + "learning_rate": 7.80884758693051e-05, + "loss": 0.6005555987358093, + "step": 5196 + }, + { + "epoch": 2.1932489451476793, + "grad_norm": 1.3534085750579834, + "learning_rate": 7.80686054840073e-05, + "loss": 0.6263643503189087, + "step": 5198 + }, + { + "epoch": 2.1940928270042193, + "grad_norm": 1.6895852088928223, + "learning_rate": 7.804872862382131e-05, + "loss": 0.6235764622688293, + "step": 5200 + }, + { + "epoch": 2.1940928270042193, + "eval_loss": 0.6915348172187805, + "eval_runtime": 1167.9782, + "eval_samples_per_second": 1.804, + "eval_steps_per_second": 1.804, + "step": 5200 + }, + { + "epoch": 2.1949367088607596, + "grad_norm": 1.138973593711853, + "learning_rate": 7.802884529333227e-05, + "loss": 0.5586035847663879, + "step": 5202 + }, + { + "epoch": 2.1957805907172996, + "grad_norm": 1.3664026260375977, + "learning_rate": 7.800895549712697e-05, + "loss": 0.5768917202949524, + "step": 5204 + }, + { + "epoch": 2.1966244725738395, + "grad_norm": 1.2182449102401733, + "learning_rate": 7.798905923979353e-05, + "loss": 0.6046215891838074, + "step": 5206 + }, + { + "epoch": 2.19746835443038, + "grad_norm": 1.2692211866378784, + "learning_rate": 7.796915652592167e-05, + "loss": 0.5412904024124146, + "step": 5208 + }, + { + "epoch": 2.19831223628692, + "grad_norm": 1.200822114944458, + "learning_rate": 7.794924736010256e-05, + "loss": 0.5328584909439087, + "step": 5210 + }, + { + "epoch": 2.19915611814346, + "grad_norm": 1.1093779802322388, + "learning_rate": 7.792933174692886e-05, + "loss": 0.5497913360595703, + "step": 5212 + }, + { + "epoch": 2.2, + "grad_norm": 1.3838921785354614, + "learning_rate": 7.790940969099471e-05, + "loss": 0.5908066034317017, + "step": 5214 + }, + { + "epoch": 2.20084388185654, + "grad_norm": 1.1411913633346558, + "learning_rate": 7.788948119689576e-05, + "loss": 0.6117307543754578, + "step": 5216 + }, + { + "epoch": 2.20168776371308, + "grad_norm": 1.5668916702270508, + "learning_rate": 7.786954626922913e-05, + "loss": 0.5788605809211731, + "step": 5218 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 1.195027232170105, + "learning_rate": 7.784960491259344e-05, + "loss": 0.5948591828346252, + "step": 5220 + }, + { + "epoch": 2.2033755274261604, + "grad_norm": 1.2665271759033203, + "learning_rate": 7.782965713158872e-05, + "loss": 0.6321669220924377, + "step": 5222 + }, + { + "epoch": 2.2042194092827003, + "grad_norm": 1.123711109161377, + "learning_rate": 7.78097029308166e-05, + "loss": 0.5853859186172485, + "step": 5224 + }, + { + "epoch": 2.2050632911392407, + "grad_norm": 1.9381071329116821, + "learning_rate": 7.77897423148801e-05, + "loss": 0.6485977172851562, + "step": 5226 + }, + { + "epoch": 2.2059071729957807, + "grad_norm": 1.4062265157699585, + "learning_rate": 7.776977528838376e-05, + "loss": 0.6243517398834229, + "step": 5228 + }, + { + "epoch": 2.2067510548523206, + "grad_norm": 1.2127182483673096, + "learning_rate": 7.774980185593358e-05, + "loss": 0.5770578980445862, + "step": 5230 + }, + { + "epoch": 2.207594936708861, + "grad_norm": 1.250847578048706, + "learning_rate": 7.772982202213709e-05, + "loss": 0.6521194577217102, + "step": 5232 + }, + { + "epoch": 2.208438818565401, + "grad_norm": 1.2568131685256958, + "learning_rate": 7.77098357916032e-05, + "loss": 0.5755271911621094, + "step": 5234 + }, + { + "epoch": 2.209282700421941, + "grad_norm": 1.2422975301742554, + "learning_rate": 7.768984316894236e-05, + "loss": 0.5486469864845276, + "step": 5236 + }, + { + "epoch": 2.2101265822784812, + "grad_norm": 1.1018635034561157, + "learning_rate": 7.766984415876652e-05, + "loss": 0.5512928366661072, + "step": 5238 + }, + { + "epoch": 2.210970464135021, + "grad_norm": 1.2261123657226562, + "learning_rate": 7.764983876568903e-05, + "loss": 0.5753499269485474, + "step": 5240 + }, + { + "epoch": 2.211814345991561, + "grad_norm": 1.2222342491149902, + "learning_rate": 7.762982699432474e-05, + "loss": 0.5404848456382751, + "step": 5242 + }, + { + "epoch": 2.212658227848101, + "grad_norm": 1.231494426727295, + "learning_rate": 7.760980884929004e-05, + "loss": 0.5999218821525574, + "step": 5244 + }, + { + "epoch": 2.2135021097046415, + "grad_norm": 1.1530078649520874, + "learning_rate": 7.758978433520268e-05, + "loss": 0.6123101115226746, + "step": 5246 + }, + { + "epoch": 2.2143459915611814, + "grad_norm": 1.182706594467163, + "learning_rate": 7.756975345668194e-05, + "loss": 0.5945886969566345, + "step": 5248 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 1.0788652896881104, + "learning_rate": 7.754971621834857e-05, + "loss": 0.5698213577270508, + "step": 5250 + }, + { + "epoch": 2.2160337552742617, + "grad_norm": 1.2243359088897705, + "learning_rate": 7.752967262482477e-05, + "loss": 0.5959678888320923, + "step": 5252 + }, + { + "epoch": 2.2168776371308017, + "grad_norm": 1.4292869567871094, + "learning_rate": 7.750962268073421e-05, + "loss": 0.586794376373291, + "step": 5254 + }, + { + "epoch": 2.2177215189873416, + "grad_norm": 1.1809570789337158, + "learning_rate": 7.748956639070204e-05, + "loss": 0.5513298511505127, + "step": 5256 + }, + { + "epoch": 2.218565400843882, + "grad_norm": 1.485813856124878, + "learning_rate": 7.746950375935484e-05, + "loss": 0.6402831673622131, + "step": 5258 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 1.0851374864578247, + "learning_rate": 7.744943479132069e-05, + "loss": 0.5729117393493652, + "step": 5260 + }, + { + "epoch": 2.220253164556962, + "grad_norm": 1.4308949708938599, + "learning_rate": 7.742935949122911e-05, + "loss": 0.6239725947380066, + "step": 5262 + }, + { + "epoch": 2.2210970464135023, + "grad_norm": 1.379258155822754, + "learning_rate": 7.740927786371107e-05, + "loss": 0.6260181069374084, + "step": 5264 + }, + { + "epoch": 2.221940928270042, + "grad_norm": 1.1661925315856934, + "learning_rate": 7.738918991339905e-05, + "loss": 0.6074157357215881, + "step": 5266 + }, + { + "epoch": 2.222784810126582, + "grad_norm": 1.168901801109314, + "learning_rate": 7.736909564492694e-05, + "loss": 0.6119515895843506, + "step": 5268 + }, + { + "epoch": 2.2236286919831225, + "grad_norm": 1.1451057195663452, + "learning_rate": 7.734899506293008e-05, + "loss": 0.5505842566490173, + "step": 5270 + }, + { + "epoch": 2.2244725738396625, + "grad_norm": 1.2303991317749023, + "learning_rate": 7.732888817204533e-05, + "loss": 0.6117991805076599, + "step": 5272 + }, + { + "epoch": 2.2253164556962024, + "grad_norm": 1.04572331905365, + "learning_rate": 7.730877497691092e-05, + "loss": 0.5589770078659058, + "step": 5274 + }, + { + "epoch": 2.226160337552743, + "grad_norm": 1.2047234773635864, + "learning_rate": 7.72886554821666e-05, + "loss": 0.6288654208183289, + "step": 5276 + }, + { + "epoch": 2.2270042194092827, + "grad_norm": 1.2036652565002441, + "learning_rate": 7.726852969245355e-05, + "loss": 0.6174501776695251, + "step": 5278 + }, + { + "epoch": 2.2278481012658227, + "grad_norm": 1.1740167140960693, + "learning_rate": 7.72483976124144e-05, + "loss": 0.6027677655220032, + "step": 5280 + }, + { + "epoch": 2.228691983122363, + "grad_norm": 1.0600008964538574, + "learning_rate": 7.722825924669326e-05, + "loss": 0.6016151309013367, + "step": 5282 + }, + { + "epoch": 2.229535864978903, + "grad_norm": 1.2631008625030518, + "learning_rate": 7.720811459993562e-05, + "loss": 0.5905849933624268, + "step": 5284 + }, + { + "epoch": 2.230379746835443, + "grad_norm": 1.1024738550186157, + "learning_rate": 7.718796367678848e-05, + "loss": 0.5129587054252625, + "step": 5286 + }, + { + "epoch": 2.2312236286919833, + "grad_norm": 1.23116934299469, + "learning_rate": 7.716780648190028e-05, + "loss": 0.5709586143493652, + "step": 5288 + }, + { + "epoch": 2.2320675105485233, + "grad_norm": 1.2739102840423584, + "learning_rate": 7.714764301992088e-05, + "loss": 0.5454761385917664, + "step": 5290 + }, + { + "epoch": 2.232911392405063, + "grad_norm": 1.303963303565979, + "learning_rate": 7.712747329550162e-05, + "loss": 0.537248969078064, + "step": 5292 + }, + { + "epoch": 2.233755274261603, + "grad_norm": 1.2454309463500977, + "learning_rate": 7.710729731329529e-05, + "loss": 0.6364415884017944, + "step": 5294 + }, + { + "epoch": 2.2345991561181435, + "grad_norm": 1.2401882410049438, + "learning_rate": 7.708711507795605e-05, + "loss": 0.5640100240707397, + "step": 5296 + }, + { + "epoch": 2.2354430379746835, + "grad_norm": 1.197432041168213, + "learning_rate": 7.706692659413959e-05, + "loss": 0.5919729471206665, + "step": 5298 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 1.1779764890670776, + "learning_rate": 7.704673186650298e-05, + "loss": 0.5569849014282227, + "step": 5300 + }, + { + "epoch": 2.2362869198312234, + "eval_loss": 0.6898328065872192, + "eval_runtime": 739.3794, + "eval_samples_per_second": 2.85, + "eval_steps_per_second": 2.85, + "step": 5300 + }, + { + "epoch": 2.237130801687764, + "grad_norm": 1.1371463537216187, + "learning_rate": 7.702653089970479e-05, + "loss": 0.5823061466217041, + "step": 5302 + }, + { + "epoch": 2.2379746835443037, + "grad_norm": 1.1877846717834473, + "learning_rate": 7.700632369840497e-05, + "loss": 0.5556252002716064, + "step": 5304 + }, + { + "epoch": 2.2388185654008437, + "grad_norm": 1.1580896377563477, + "learning_rate": 7.698611026726492e-05, + "loss": 0.5794119834899902, + "step": 5306 + }, + { + "epoch": 2.239662447257384, + "grad_norm": 1.29141366481781, + "learning_rate": 7.696589061094755e-05, + "loss": 0.5828680396080017, + "step": 5308 + }, + { + "epoch": 2.240506329113924, + "grad_norm": 1.1286728382110596, + "learning_rate": 7.694566473411706e-05, + "loss": 0.6161736845970154, + "step": 5310 + }, + { + "epoch": 2.241350210970464, + "grad_norm": 1.0969985723495483, + "learning_rate": 7.692543264143925e-05, + "loss": 0.570767879486084, + "step": 5312 + }, + { + "epoch": 2.2421940928270043, + "grad_norm": 1.2902227640151978, + "learning_rate": 7.690519433758123e-05, + "loss": 0.631476104259491, + "step": 5314 + }, + { + "epoch": 2.2430379746835443, + "grad_norm": 1.432735800743103, + "learning_rate": 7.68849498272116e-05, + "loss": 0.6142309904098511, + "step": 5316 + }, + { + "epoch": 2.243881856540084, + "grad_norm": 1.0824161767959595, + "learning_rate": 7.686469911500038e-05, + "loss": 0.5871514081954956, + "step": 5318 + }, + { + "epoch": 2.2447257383966246, + "grad_norm": 1.1694978475570679, + "learning_rate": 7.684444220561902e-05, + "loss": 0.6144557595252991, + "step": 5320 + }, + { + "epoch": 2.2455696202531645, + "grad_norm": 1.2981040477752686, + "learning_rate": 7.68241791037404e-05, + "loss": 0.6049425601959229, + "step": 5322 + }, + { + "epoch": 2.2464135021097045, + "grad_norm": 1.132128357887268, + "learning_rate": 7.680390981403885e-05, + "loss": 0.5571867823600769, + "step": 5324 + }, + { + "epoch": 2.247257383966245, + "grad_norm": 1.1760079860687256, + "learning_rate": 7.678363434119005e-05, + "loss": 0.5710517168045044, + "step": 5326 + }, + { + "epoch": 2.248101265822785, + "grad_norm": 1.1918572187423706, + "learning_rate": 7.67633526898712e-05, + "loss": 0.5508866906166077, + "step": 5328 + }, + { + "epoch": 2.2489451476793247, + "grad_norm": 1.1837294101715088, + "learning_rate": 7.674306486476091e-05, + "loss": 0.6242696046829224, + "step": 5330 + }, + { + "epoch": 2.249789029535865, + "grad_norm": 1.384918212890625, + "learning_rate": 7.672277087053914e-05, + "loss": 0.5821678042411804, + "step": 5332 + }, + { + "epoch": 2.250632911392405, + "grad_norm": 1.1248877048492432, + "learning_rate": 7.670247071188738e-05, + "loss": 0.5415928363800049, + "step": 5334 + }, + { + "epoch": 2.251476793248945, + "grad_norm": 1.228140950202942, + "learning_rate": 7.668216439348843e-05, + "loss": 0.5475174188613892, + "step": 5336 + }, + { + "epoch": 2.2523206751054854, + "grad_norm": 1.3816046714782715, + "learning_rate": 7.666185192002662e-05, + "loss": 0.5793306231498718, + "step": 5338 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 1.2446565628051758, + "learning_rate": 7.664153329618759e-05, + "loss": 0.6221131682395935, + "step": 5340 + }, + { + "epoch": 2.2540084388185653, + "grad_norm": 1.1677669286727905, + "learning_rate": 7.662120852665852e-05, + "loss": 0.5403847694396973, + "step": 5342 + }, + { + "epoch": 2.2548523206751057, + "grad_norm": 1.2485873699188232, + "learning_rate": 7.66008776161279e-05, + "loss": 0.620201587677002, + "step": 5344 + }, + { + "epoch": 2.2556962025316456, + "grad_norm": 1.2486802339553833, + "learning_rate": 7.658054056928568e-05, + "loss": 0.5969216227531433, + "step": 5346 + }, + { + "epoch": 2.2565400843881855, + "grad_norm": 1.2621372938156128, + "learning_rate": 7.656019739082326e-05, + "loss": 0.6376339793205261, + "step": 5348 + }, + { + "epoch": 2.257383966244726, + "grad_norm": 1.238633155822754, + "learning_rate": 7.65398480854334e-05, + "loss": 0.6374872326850891, + "step": 5350 + }, + { + "epoch": 2.258227848101266, + "grad_norm": 1.3031803369522095, + "learning_rate": 7.651949265781029e-05, + "loss": 0.6348551511764526, + "step": 5352 + }, + { + "epoch": 2.259071729957806, + "grad_norm": 1.3735158443450928, + "learning_rate": 7.649913111264952e-05, + "loss": 0.6267750859260559, + "step": 5354 + }, + { + "epoch": 2.259915611814346, + "grad_norm": 1.1227772235870361, + "learning_rate": 7.647876345464817e-05, + "loss": 0.623030960559845, + "step": 5356 + }, + { + "epoch": 2.260759493670886, + "grad_norm": 1.4555678367614746, + "learning_rate": 7.645838968850459e-05, + "loss": 0.5810713171958923, + "step": 5358 + }, + { + "epoch": 2.261603375527426, + "grad_norm": 1.227725863456726, + "learning_rate": 7.643800981891867e-05, + "loss": 0.6150093078613281, + "step": 5360 + }, + { + "epoch": 2.2624472573839665, + "grad_norm": 1.0648300647735596, + "learning_rate": 7.641762385059161e-05, + "loss": 0.5350445508956909, + "step": 5362 + }, + { + "epoch": 2.2632911392405064, + "grad_norm": 1.179452896118164, + "learning_rate": 7.639723178822613e-05, + "loss": 0.6253421306610107, + "step": 5364 + }, + { + "epoch": 2.2641350210970463, + "grad_norm": 1.0983240604400635, + "learning_rate": 7.637683363652621e-05, + "loss": 0.5512562990188599, + "step": 5366 + }, + { + "epoch": 2.2649789029535867, + "grad_norm": 1.1825451850891113, + "learning_rate": 7.635642940019736e-05, + "loss": 0.5584151148796082, + "step": 5368 + }, + { + "epoch": 2.2658227848101267, + "grad_norm": 1.1022000312805176, + "learning_rate": 7.633601908394643e-05, + "loss": 0.5881790518760681, + "step": 5370 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.1935697793960571, + "learning_rate": 7.631560269248169e-05, + "loss": 0.6060683131217957, + "step": 5372 + }, + { + "epoch": 2.267510548523207, + "grad_norm": 1.1174103021621704, + "learning_rate": 7.62951802305128e-05, + "loss": 0.5877062678337097, + "step": 5374 + }, + { + "epoch": 2.268354430379747, + "grad_norm": 1.3934977054595947, + "learning_rate": 7.627475170275086e-05, + "loss": 0.5145504474639893, + "step": 5376 + }, + { + "epoch": 2.269198312236287, + "grad_norm": 1.2637842893600464, + "learning_rate": 7.625431711390831e-05, + "loss": 0.6194025874137878, + "step": 5378 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 1.2034388780593872, + "learning_rate": 7.623387646869902e-05, + "loss": 0.6205627918243408, + "step": 5380 + }, + { + "epoch": 2.270886075949367, + "grad_norm": 0.953880250453949, + "learning_rate": 7.621342977183826e-05, + "loss": 0.5609696507453918, + "step": 5382 + }, + { + "epoch": 2.271729957805907, + "grad_norm": 1.2841949462890625, + "learning_rate": 7.619297702804272e-05, + "loss": 0.6044906377792358, + "step": 5384 + }, + { + "epoch": 2.272573839662447, + "grad_norm": 1.146804690361023, + "learning_rate": 7.617251824203037e-05, + "loss": 0.5420435667037964, + "step": 5386 + }, + { + "epoch": 2.2734177215189875, + "grad_norm": 1.2225698232650757, + "learning_rate": 7.615205341852076e-05, + "loss": 0.6230710744857788, + "step": 5388 + }, + { + "epoch": 2.2742616033755274, + "grad_norm": 1.3423371315002441, + "learning_rate": 7.613158256223467e-05, + "loss": 0.6486349701881409, + "step": 5390 + }, + { + "epoch": 2.2751054852320673, + "grad_norm": 1.0840023756027222, + "learning_rate": 7.611110567789435e-05, + "loss": 0.6527825593948364, + "step": 5392 + }, + { + "epoch": 2.2759493670886077, + "grad_norm": 1.342466950416565, + "learning_rate": 7.609062277022341e-05, + "loss": 0.6859483122825623, + "step": 5394 + }, + { + "epoch": 2.2767932489451477, + "grad_norm": 1.0406129360198975, + "learning_rate": 7.607013384394691e-05, + "loss": 0.5536003708839417, + "step": 5396 + }, + { + "epoch": 2.2776371308016876, + "grad_norm": 1.0853544473648071, + "learning_rate": 7.604963890379118e-05, + "loss": 0.5488654971122742, + "step": 5398 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 1.0330145359039307, + "learning_rate": 7.602913795448407e-05, + "loss": 0.6072142720222473, + "step": 5400 + }, + { + "epoch": 2.278481012658228, + "eval_loss": 0.6875645518302917, + "eval_runtime": 861.3558, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 5400 + }, + { + "epoch": 2.279324894514768, + "grad_norm": 1.1858742237091064, + "learning_rate": 7.600863100075472e-05, + "loss": 0.5420109033584595, + "step": 5402 + }, + { + "epoch": 2.280168776371308, + "grad_norm": 1.2126039266586304, + "learning_rate": 7.598811804733373e-05, + "loss": 0.6109243631362915, + "step": 5404 + }, + { + "epoch": 2.2810126582278483, + "grad_norm": 1.1290241479873657, + "learning_rate": 7.5967599098953e-05, + "loss": 0.5889696478843689, + "step": 5406 + }, + { + "epoch": 2.281856540084388, + "grad_norm": 1.320263147354126, + "learning_rate": 7.594707416034586e-05, + "loss": 0.6548630595207214, + "step": 5408 + }, + { + "epoch": 2.282700421940928, + "grad_norm": 1.346169114112854, + "learning_rate": 7.592654323624703e-05, + "loss": 0.6556787490844727, + "step": 5410 + }, + { + "epoch": 2.2835443037974685, + "grad_norm": 1.2104716300964355, + "learning_rate": 7.590600633139265e-05, + "loss": 0.5631673336029053, + "step": 5412 + }, + { + "epoch": 2.2843881856540085, + "grad_norm": 1.3298237323760986, + "learning_rate": 7.58854634505201e-05, + "loss": 0.5931088328361511, + "step": 5414 + }, + { + "epoch": 2.2852320675105484, + "grad_norm": 1.4201204776763916, + "learning_rate": 7.586491459836829e-05, + "loss": 0.6966755986213684, + "step": 5416 + }, + { + "epoch": 2.286075949367089, + "grad_norm": 1.253135323524475, + "learning_rate": 7.584435977967743e-05, + "loss": 0.6172569394111633, + "step": 5418 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 1.133144736289978, + "learning_rate": 7.582379899918911e-05, + "loss": 0.5376655459403992, + "step": 5420 + }, + { + "epoch": 2.2877637130801687, + "grad_norm": 1.1103745698928833, + "learning_rate": 7.580323226164632e-05, + "loss": 0.6138498187065125, + "step": 5422 + }, + { + "epoch": 2.2886075949367086, + "grad_norm": 1.091636300086975, + "learning_rate": 7.57826595717934e-05, + "loss": 0.5049096345901489, + "step": 5424 + }, + { + "epoch": 2.289451476793249, + "grad_norm": 1.2486571073532104, + "learning_rate": 7.57620809343761e-05, + "loss": 0.5666115283966064, + "step": 5426 + }, + { + "epoch": 2.290295358649789, + "grad_norm": 1.510684847831726, + "learning_rate": 7.57414963541415e-05, + "loss": 0.49512919783592224, + "step": 5428 + }, + { + "epoch": 2.291139240506329, + "grad_norm": 1.1142191886901855, + "learning_rate": 7.572090583583805e-05, + "loss": 0.558807373046875, + "step": 5430 + }, + { + "epoch": 2.2919831223628693, + "grad_norm": 1.1162657737731934, + "learning_rate": 7.57003093842156e-05, + "loss": 0.6245265603065491, + "step": 5432 + }, + { + "epoch": 2.292827004219409, + "grad_norm": 1.2784614562988281, + "learning_rate": 7.567970700402537e-05, + "loss": 0.5505527853965759, + "step": 5434 + }, + { + "epoch": 2.293670886075949, + "grad_norm": 1.3142638206481934, + "learning_rate": 7.565909870001992e-05, + "loss": 0.6137702465057373, + "step": 5436 + }, + { + "epoch": 2.2945147679324895, + "grad_norm": 1.072805404663086, + "learning_rate": 7.563848447695318e-05, + "loss": 0.540766716003418, + "step": 5438 + }, + { + "epoch": 2.2953586497890295, + "grad_norm": 1.2861377000808716, + "learning_rate": 7.561786433958048e-05, + "loss": 0.6806555986404419, + "step": 5440 + }, + { + "epoch": 2.2962025316455694, + "grad_norm": 1.3193045854568481, + "learning_rate": 7.559723829265847e-05, + "loss": 0.6191258430480957, + "step": 5442 + }, + { + "epoch": 2.29704641350211, + "grad_norm": 1.1969127655029297, + "learning_rate": 7.55766063409452e-05, + "loss": 0.6067718863487244, + "step": 5444 + }, + { + "epoch": 2.2978902953586497, + "grad_norm": 1.2129666805267334, + "learning_rate": 7.555596848920006e-05, + "loss": 0.5673627257347107, + "step": 5446 + }, + { + "epoch": 2.2987341772151897, + "grad_norm": 1.1639961004257202, + "learning_rate": 7.553532474218379e-05, + "loss": 0.61825031042099, + "step": 5448 + }, + { + "epoch": 2.29957805907173, + "grad_norm": 1.3893283605575562, + "learning_rate": 7.551467510465852e-05, + "loss": 0.6096790432929993, + "step": 5450 + }, + { + "epoch": 2.30042194092827, + "grad_norm": 1.0708417892456055, + "learning_rate": 7.549401958138772e-05, + "loss": 0.6121414303779602, + "step": 5452 + }, + { + "epoch": 2.30126582278481, + "grad_norm": 1.3299298286437988, + "learning_rate": 7.547335817713624e-05, + "loss": 0.6504668593406677, + "step": 5454 + }, + { + "epoch": 2.3021097046413503, + "grad_norm": 1.3594682216644287, + "learning_rate": 7.545269089667022e-05, + "loss": 0.5761144161224365, + "step": 5456 + }, + { + "epoch": 2.3029535864978903, + "grad_norm": 1.1089586019515991, + "learning_rate": 7.543201774475726e-05, + "loss": 0.5457773804664612, + "step": 5458 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 1.3472918272018433, + "learning_rate": 7.541133872616624e-05, + "loss": 0.6014775037765503, + "step": 5460 + }, + { + "epoch": 2.3046413502109706, + "grad_norm": 1.2757689952850342, + "learning_rate": 7.53906538456674e-05, + "loss": 0.6246467232704163, + "step": 5462 + }, + { + "epoch": 2.3054852320675105, + "grad_norm": 1.4598166942596436, + "learning_rate": 7.536996310803236e-05, + "loss": 0.6583935022354126, + "step": 5464 + }, + { + "epoch": 2.3063291139240505, + "grad_norm": 1.2861602306365967, + "learning_rate": 7.534926651803407e-05, + "loss": 0.562523603439331, + "step": 5466 + }, + { + "epoch": 2.307172995780591, + "grad_norm": 1.0953221321105957, + "learning_rate": 7.532856408044684e-05, + "loss": 0.6093505620956421, + "step": 5468 + }, + { + "epoch": 2.308016877637131, + "grad_norm": 1.0982829332351685, + "learning_rate": 7.530785580004631e-05, + "loss": 0.6196447014808655, + "step": 5470 + }, + { + "epoch": 2.3088607594936708, + "grad_norm": 1.2224280834197998, + "learning_rate": 7.52871416816095e-05, + "loss": 0.6360989212989807, + "step": 5472 + }, + { + "epoch": 2.309704641350211, + "grad_norm": 1.244486927986145, + "learning_rate": 7.526642172991476e-05, + "loss": 0.6189543008804321, + "step": 5474 + }, + { + "epoch": 2.310548523206751, + "grad_norm": 1.2408053874969482, + "learning_rate": 7.524569594974178e-05, + "loss": 0.6137582659721375, + "step": 5476 + }, + { + "epoch": 2.311392405063291, + "grad_norm": 1.3323272466659546, + "learning_rate": 7.522496434587157e-05, + "loss": 0.6462169289588928, + "step": 5478 + }, + { + "epoch": 2.3122362869198314, + "grad_norm": 1.1076425313949585, + "learning_rate": 7.520422692308657e-05, + "loss": 0.5495362877845764, + "step": 5480 + }, + { + "epoch": 2.3130801687763713, + "grad_norm": 1.3298509120941162, + "learning_rate": 7.518348368617046e-05, + "loss": 0.5560636520385742, + "step": 5482 + }, + { + "epoch": 2.3139240506329113, + "grad_norm": 1.0740195512771606, + "learning_rate": 7.516273463990832e-05, + "loss": 0.5763371586799622, + "step": 5484 + }, + { + "epoch": 2.3147679324894517, + "grad_norm": 1.0748567581176758, + "learning_rate": 7.514197978908657e-05, + "loss": 0.5111498832702637, + "step": 5486 + }, + { + "epoch": 2.3156118143459916, + "grad_norm": 1.2047218084335327, + "learning_rate": 7.512121913849294e-05, + "loss": 0.6599951982498169, + "step": 5488 + }, + { + "epoch": 2.3164556962025316, + "grad_norm": 1.2956700325012207, + "learning_rate": 7.510045269291651e-05, + "loss": 0.6409770846366882, + "step": 5490 + }, + { + "epoch": 2.317299578059072, + "grad_norm": 1.241860032081604, + "learning_rate": 7.50796804571477e-05, + "loss": 0.5967662334442139, + "step": 5492 + }, + { + "epoch": 2.318143459915612, + "grad_norm": 1.1612682342529297, + "learning_rate": 7.50589024359783e-05, + "loss": 0.5856342315673828, + "step": 5494 + }, + { + "epoch": 2.318987341772152, + "grad_norm": 1.0895500183105469, + "learning_rate": 7.503811863420135e-05, + "loss": 0.5652023553848267, + "step": 5496 + }, + { + "epoch": 2.319831223628692, + "grad_norm": 1.3374481201171875, + "learning_rate": 7.50173290566113e-05, + "loss": 0.6777268648147583, + "step": 5498 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 1.192614197731018, + "learning_rate": 7.499653370800391e-05, + "loss": 0.6052314043045044, + "step": 5500 + }, + { + "epoch": 2.320675105485232, + "eval_loss": 0.6867148876190186, + "eval_runtime": 941.3545, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5500 + }, + { + "epoch": 2.321518987341772, + "grad_norm": 1.1008832454681396, + "learning_rate": 7.497573259317625e-05, + "loss": 0.5208253860473633, + "step": 5502 + }, + { + "epoch": 2.3223628691983125, + "grad_norm": 1.2141541242599487, + "learning_rate": 7.495492571692677e-05, + "loss": 0.6352296471595764, + "step": 5504 + }, + { + "epoch": 2.3232067510548524, + "grad_norm": 1.2588802576065063, + "learning_rate": 7.493411308405517e-05, + "loss": 0.6132256388664246, + "step": 5506 + }, + { + "epoch": 2.3240506329113924, + "grad_norm": 1.348765254020691, + "learning_rate": 7.491329469936258e-05, + "loss": 0.571265697479248, + "step": 5508 + }, + { + "epoch": 2.3248945147679323, + "grad_norm": 1.266377329826355, + "learning_rate": 7.489247056765135e-05, + "loss": 0.5433708429336548, + "step": 5510 + }, + { + "epoch": 2.3257383966244727, + "grad_norm": 1.2920128107070923, + "learning_rate": 7.487164069372523e-05, + "loss": 0.6193158030509949, + "step": 5512 + }, + { + "epoch": 2.3265822784810126, + "grad_norm": 1.068169116973877, + "learning_rate": 7.485080508238928e-05, + "loss": 0.5817977786064148, + "step": 5514 + }, + { + "epoch": 2.3274261603375526, + "grad_norm": 1.2941710948944092, + "learning_rate": 7.482996373844985e-05, + "loss": 0.6558082103729248, + "step": 5516 + }, + { + "epoch": 2.328270042194093, + "grad_norm": 1.2143336534500122, + "learning_rate": 7.480911666671467e-05, + "loss": 0.5569961667060852, + "step": 5518 + }, + { + "epoch": 2.329113924050633, + "grad_norm": 1.3364789485931396, + "learning_rate": 7.478826387199274e-05, + "loss": 0.6497300863265991, + "step": 5520 + }, + { + "epoch": 2.329957805907173, + "grad_norm": 1.057530403137207, + "learning_rate": 7.47674053590944e-05, + "loss": 0.5793087482452393, + "step": 5522 + }, + { + "epoch": 2.330801687763713, + "grad_norm": 1.1543176174163818, + "learning_rate": 7.47465411328313e-05, + "loss": 0.5583140850067139, + "step": 5524 + }, + { + "epoch": 2.331645569620253, + "grad_norm": 1.3409180641174316, + "learning_rate": 7.472567119801645e-05, + "loss": 0.6318784952163696, + "step": 5526 + }, + { + "epoch": 2.332489451476793, + "grad_norm": 1.2899413108825684, + "learning_rate": 7.47047955594641e-05, + "loss": 0.5950855612754822, + "step": 5528 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.329220175743103, + "learning_rate": 7.468391422198989e-05, + "loss": 0.6181023716926575, + "step": 5530 + }, + { + "epoch": 2.3341772151898734, + "grad_norm": 1.202129602432251, + "learning_rate": 7.466302719041073e-05, + "loss": 0.6384578943252563, + "step": 5532 + }, + { + "epoch": 2.3350210970464134, + "grad_norm": 1.1890549659729004, + "learning_rate": 7.464213446954487e-05, + "loss": 0.6059293746948242, + "step": 5534 + }, + { + "epoch": 2.3358649789029537, + "grad_norm": 1.2041429281234741, + "learning_rate": 7.462123606421183e-05, + "loss": 0.6432797908782959, + "step": 5536 + }, + { + "epoch": 2.3367088607594937, + "grad_norm": 1.3827080726623535, + "learning_rate": 7.460033197923249e-05, + "loss": 0.6796717047691345, + "step": 5538 + }, + { + "epoch": 2.3375527426160336, + "grad_norm": 1.2323482036590576, + "learning_rate": 7.457942221942903e-05, + "loss": 0.5772476196289062, + "step": 5540 + }, + { + "epoch": 2.338396624472574, + "grad_norm": 1.2011388540267944, + "learning_rate": 7.455850678962493e-05, + "loss": 0.5964269042015076, + "step": 5542 + }, + { + "epoch": 2.339240506329114, + "grad_norm": 1.1133569478988647, + "learning_rate": 7.453758569464495e-05, + "loss": 0.6416608095169067, + "step": 5544 + }, + { + "epoch": 2.340084388185654, + "grad_norm": 1.1257679462432861, + "learning_rate": 7.451665893931521e-05, + "loss": 0.5668829679489136, + "step": 5546 + }, + { + "epoch": 2.3409282700421943, + "grad_norm": 1.3494724035263062, + "learning_rate": 7.449572652846311e-05, + "loss": 0.6029916405677795, + "step": 5548 + }, + { + "epoch": 2.3417721518987342, + "grad_norm": 1.2199759483337402, + "learning_rate": 7.447478846691735e-05, + "loss": 0.6336984634399414, + "step": 5550 + }, + { + "epoch": 2.342616033755274, + "grad_norm": 1.2806570529937744, + "learning_rate": 7.445384475950792e-05, + "loss": 0.579140305519104, + "step": 5552 + }, + { + "epoch": 2.343459915611814, + "grad_norm": 0.9874221086502075, + "learning_rate": 7.443289541106616e-05, + "loss": 0.6061640381813049, + "step": 5554 + }, + { + "epoch": 2.3443037974683545, + "grad_norm": 1.2271486520767212, + "learning_rate": 7.441194042642467e-05, + "loss": 0.5502339601516724, + "step": 5556 + }, + { + "epoch": 2.3451476793248944, + "grad_norm": 1.2522462606430054, + "learning_rate": 7.439097981041738e-05, + "loss": 0.5774438381195068, + "step": 5558 + }, + { + "epoch": 2.3459915611814344, + "grad_norm": 1.267204761505127, + "learning_rate": 7.437001356787945e-05, + "loss": 0.6091527342796326, + "step": 5560 + }, + { + "epoch": 2.3468354430379748, + "grad_norm": 1.1711935997009277, + "learning_rate": 7.434904170364747e-05, + "loss": 0.5443631410598755, + "step": 5562 + }, + { + "epoch": 2.3476793248945147, + "grad_norm": 1.085097074508667, + "learning_rate": 7.432806422255918e-05, + "loss": 0.5255029201507568, + "step": 5564 + }, + { + "epoch": 2.3485232067510546, + "grad_norm": 1.3244949579238892, + "learning_rate": 7.430708112945369e-05, + "loss": 0.5197238922119141, + "step": 5566 + }, + { + "epoch": 2.349367088607595, + "grad_norm": 1.3646879196166992, + "learning_rate": 7.428609242917141e-05, + "loss": 0.5576170682907104, + "step": 5568 + }, + { + "epoch": 2.350210970464135, + "grad_norm": 1.339190125465393, + "learning_rate": 7.426509812655406e-05, + "loss": 0.6254662275314331, + "step": 5570 + }, + { + "epoch": 2.351054852320675, + "grad_norm": 1.4624155759811401, + "learning_rate": 7.424409822644457e-05, + "loss": 0.6593500375747681, + "step": 5572 + }, + { + "epoch": 2.3518987341772153, + "grad_norm": 1.1931114196777344, + "learning_rate": 7.422309273368722e-05, + "loss": 0.6102238297462463, + "step": 5574 + }, + { + "epoch": 2.3527426160337552, + "grad_norm": 1.789340615272522, + "learning_rate": 7.420208165312762e-05, + "loss": 0.6695854067802429, + "step": 5576 + }, + { + "epoch": 2.353586497890295, + "grad_norm": 1.2364262342453003, + "learning_rate": 7.418106498961258e-05, + "loss": 0.578844428062439, + "step": 5578 + }, + { + "epoch": 2.3544303797468356, + "grad_norm": 1.1568509340286255, + "learning_rate": 7.416004274799027e-05, + "loss": 0.5717503428459167, + "step": 5580 + }, + { + "epoch": 2.3552742616033755, + "grad_norm": 1.1744630336761475, + "learning_rate": 7.413901493311009e-05, + "loss": 0.6170201897621155, + "step": 5582 + }, + { + "epoch": 2.3561181434599154, + "grad_norm": 1.0684332847595215, + "learning_rate": 7.411798154982275e-05, + "loss": 0.6482691764831543, + "step": 5584 + }, + { + "epoch": 2.356962025316456, + "grad_norm": 1.046196460723877, + "learning_rate": 7.409694260298025e-05, + "loss": 0.572839617729187, + "step": 5586 + }, + { + "epoch": 2.3578059071729958, + "grad_norm": 1.0110210180282593, + "learning_rate": 7.407589809743591e-05, + "loss": 0.5645976662635803, + "step": 5588 + }, + { + "epoch": 2.3586497890295357, + "grad_norm": 1.0801016092300415, + "learning_rate": 7.405484803804425e-05, + "loss": 0.5653133392333984, + "step": 5590 + }, + { + "epoch": 2.359493670886076, + "grad_norm": 1.0934380292892456, + "learning_rate": 7.403379242966116e-05, + "loss": 0.5972150564193726, + "step": 5592 + }, + { + "epoch": 2.360337552742616, + "grad_norm": 1.3722410202026367, + "learning_rate": 7.40127312771437e-05, + "loss": 0.5927542448043823, + "step": 5594 + }, + { + "epoch": 2.361181434599156, + "grad_norm": 1.1567236185073853, + "learning_rate": 7.399166458535032e-05, + "loss": 0.547027051448822, + "step": 5596 + }, + { + "epoch": 2.3620253164556964, + "grad_norm": 1.2254211902618408, + "learning_rate": 7.397059235914067e-05, + "loss": 0.5356617569923401, + "step": 5598 + }, + { + "epoch": 2.3628691983122363, + "grad_norm": 1.1529103517532349, + "learning_rate": 7.394951460337575e-05, + "loss": 0.5424175262451172, + "step": 5600 + }, + { + "epoch": 2.3628691983122363, + "eval_loss": 0.6851074695587158, + "eval_runtime": 938.5536, + "eval_samples_per_second": 2.245, + "eval_steps_per_second": 2.245, + "step": 5600 + }, + { + "epoch": 2.3637130801687762, + "grad_norm": 1.2050299644470215, + "learning_rate": 7.392843132291777e-05, + "loss": 0.5834107398986816, + "step": 5602 + }, + { + "epoch": 2.3645569620253166, + "grad_norm": 1.264567494392395, + "learning_rate": 7.390734252263024e-05, + "loss": 0.5445035099983215, + "step": 5604 + }, + { + "epoch": 2.3654008438818566, + "grad_norm": 1.357791781425476, + "learning_rate": 7.388624820737791e-05, + "loss": 0.6207653880119324, + "step": 5606 + }, + { + "epoch": 2.3662447257383965, + "grad_norm": 1.2246928215026855, + "learning_rate": 7.386514838202689e-05, + "loss": 0.6628696322441101, + "step": 5608 + }, + { + "epoch": 2.367088607594937, + "grad_norm": 1.1455399990081787, + "learning_rate": 7.384404305144447e-05, + "loss": 0.5870704054832458, + "step": 5610 + }, + { + "epoch": 2.367932489451477, + "grad_norm": 1.2338638305664062, + "learning_rate": 7.382293222049925e-05, + "loss": 0.6160538792610168, + "step": 5612 + }, + { + "epoch": 2.3687763713080168, + "grad_norm": 1.231271505355835, + "learning_rate": 7.38018158940611e-05, + "loss": 0.6274036765098572, + "step": 5614 + }, + { + "epoch": 2.369620253164557, + "grad_norm": 1.022050380706787, + "learning_rate": 7.378069407700114e-05, + "loss": 0.5623515248298645, + "step": 5616 + }, + { + "epoch": 2.370464135021097, + "grad_norm": 1.2040951251983643, + "learning_rate": 7.375956677419178e-05, + "loss": 0.5505564212799072, + "step": 5618 + }, + { + "epoch": 2.371308016877637, + "grad_norm": 1.1754523515701294, + "learning_rate": 7.373843399050668e-05, + "loss": 0.6537002921104431, + "step": 5620 + }, + { + "epoch": 2.3721518987341774, + "grad_norm": 1.1710485219955444, + "learning_rate": 7.371729573082073e-05, + "loss": 0.6224458813667297, + "step": 5622 + }, + { + "epoch": 2.3729957805907174, + "grad_norm": 1.1629483699798584, + "learning_rate": 7.36961520000102e-05, + "loss": 0.6297177076339722, + "step": 5624 + }, + { + "epoch": 2.3738396624472573, + "grad_norm": 1.1069440841674805, + "learning_rate": 7.367500280295248e-05, + "loss": 0.5202008485794067, + "step": 5626 + }, + { + "epoch": 2.3746835443037977, + "grad_norm": 1.0068297386169434, + "learning_rate": 7.36538481445263e-05, + "loss": 0.5256102681159973, + "step": 5628 + }, + { + "epoch": 2.3755274261603376, + "grad_norm": 1.1103417873382568, + "learning_rate": 7.363268802961161e-05, + "loss": 0.5460903644561768, + "step": 5630 + }, + { + "epoch": 2.3763713080168776, + "grad_norm": 1.2885268926620483, + "learning_rate": 7.361152246308969e-05, + "loss": 0.5817124247550964, + "step": 5632 + }, + { + "epoch": 2.377215189873418, + "grad_norm": 1.233831524848938, + "learning_rate": 7.359035144984302e-05, + "loss": 0.5415143966674805, + "step": 5634 + }, + { + "epoch": 2.378059071729958, + "grad_norm": 1.3451908826828003, + "learning_rate": 7.35691749947553e-05, + "loss": 0.6837685108184814, + "step": 5636 + }, + { + "epoch": 2.378902953586498, + "grad_norm": 1.1320621967315674, + "learning_rate": 7.354799310271159e-05, + "loss": 0.5966196656227112, + "step": 5638 + }, + { + "epoch": 2.379746835443038, + "grad_norm": 1.1884461641311646, + "learning_rate": 7.35268057785981e-05, + "loss": 0.5607479214668274, + "step": 5640 + }, + { + "epoch": 2.380590717299578, + "grad_norm": 1.2710856199264526, + "learning_rate": 7.350561302730236e-05, + "loss": 0.595242977142334, + "step": 5642 + }, + { + "epoch": 2.381434599156118, + "grad_norm": 1.3110458850860596, + "learning_rate": 7.348441485371314e-05, + "loss": 0.6208752393722534, + "step": 5644 + }, + { + "epoch": 2.382278481012658, + "grad_norm": 1.1734380722045898, + "learning_rate": 7.346321126272044e-05, + "loss": 0.6173125505447388, + "step": 5646 + }, + { + "epoch": 2.3831223628691984, + "grad_norm": 1.2024762630462646, + "learning_rate": 7.34420022592155e-05, + "loss": 0.6013050675392151, + "step": 5648 + }, + { + "epoch": 2.3839662447257384, + "grad_norm": 1.1305288076400757, + "learning_rate": 7.342078784809086e-05, + "loss": 0.5919594764709473, + "step": 5650 + }, + { + "epoch": 2.3848101265822783, + "grad_norm": 1.075323462486267, + "learning_rate": 7.339956803424028e-05, + "loss": 0.5399283766746521, + "step": 5652 + }, + { + "epoch": 2.3856540084388187, + "grad_norm": 1.2035599946975708, + "learning_rate": 7.337834282255873e-05, + "loss": 0.6253576874732971, + "step": 5654 + }, + { + "epoch": 2.3864978902953586, + "grad_norm": 1.0572105646133423, + "learning_rate": 7.335711221794251e-05, + "loss": 0.5247007608413696, + "step": 5656 + }, + { + "epoch": 2.3873417721518986, + "grad_norm": 1.2701191902160645, + "learning_rate": 7.333587622528906e-05, + "loss": 0.5800243020057678, + "step": 5658 + }, + { + "epoch": 2.388185654008439, + "grad_norm": 1.1772741079330444, + "learning_rate": 7.331463484949716e-05, + "loss": 0.589645504951477, + "step": 5660 + }, + { + "epoch": 2.389029535864979, + "grad_norm": 1.0562703609466553, + "learning_rate": 7.329338809546674e-05, + "loss": 0.5820419192314148, + "step": 5662 + }, + { + "epoch": 2.389873417721519, + "grad_norm": 1.1634355783462524, + "learning_rate": 7.327213596809906e-05, + "loss": 0.591435432434082, + "step": 5664 + }, + { + "epoch": 2.3907172995780592, + "grad_norm": 1.2220302820205688, + "learning_rate": 7.325087847229655e-05, + "loss": 0.5630883574485779, + "step": 5666 + }, + { + "epoch": 2.391561181434599, + "grad_norm": 1.4087659120559692, + "learning_rate": 7.322961561296294e-05, + "loss": 0.6050130128860474, + "step": 5668 + }, + { + "epoch": 2.392405063291139, + "grad_norm": 1.1126172542572021, + "learning_rate": 7.320834739500313e-05, + "loss": 0.56146240234375, + "step": 5670 + }, + { + "epoch": 2.3932489451476795, + "grad_norm": 0.99373859167099, + "learning_rate": 7.31870738233233e-05, + "loss": 0.5507852435112, + "step": 5672 + }, + { + "epoch": 2.3940928270042194, + "grad_norm": 1.14408540725708, + "learning_rate": 7.316579490283085e-05, + "loss": 0.5895347595214844, + "step": 5674 + }, + { + "epoch": 2.3949367088607594, + "grad_norm": 1.1728581190109253, + "learning_rate": 7.314451063843443e-05, + "loss": 0.5304404497146606, + "step": 5676 + }, + { + "epoch": 2.3957805907172998, + "grad_norm": 1.1721378564834595, + "learning_rate": 7.31232210350439e-05, + "loss": 0.5805793404579163, + "step": 5678 + }, + { + "epoch": 2.3966244725738397, + "grad_norm": 1.0499866008758545, + "learning_rate": 7.310192609757038e-05, + "loss": 0.5671767592430115, + "step": 5680 + }, + { + "epoch": 2.3974683544303796, + "grad_norm": 1.0959177017211914, + "learning_rate": 7.308062583092617e-05, + "loss": 0.6335723400115967, + "step": 5682 + }, + { + "epoch": 2.3983122362869196, + "grad_norm": 1.31142258644104, + "learning_rate": 7.305932024002487e-05, + "loss": 0.6032374501228333, + "step": 5684 + }, + { + "epoch": 2.39915611814346, + "grad_norm": 0.9212818741798401, + "learning_rate": 7.303800932978124e-05, + "loss": 0.5492936372756958, + "step": 5686 + }, + { + "epoch": 2.4, + "grad_norm": 1.1956428289413452, + "learning_rate": 7.301669310511132e-05, + "loss": 0.5533297061920166, + "step": 5688 + }, + { + "epoch": 2.40084388185654, + "grad_norm": 1.4048634767532349, + "learning_rate": 7.299537157093232e-05, + "loss": 0.5859368443489075, + "step": 5690 + }, + { + "epoch": 2.4016877637130802, + "grad_norm": 1.0580679178237915, + "learning_rate": 7.297404473216277e-05, + "loss": 0.5099439024925232, + "step": 5692 + }, + { + "epoch": 2.40253164556962, + "grad_norm": 1.2450575828552246, + "learning_rate": 7.29527125937223e-05, + "loss": 0.5631486177444458, + "step": 5694 + }, + { + "epoch": 2.40337552742616, + "grad_norm": 1.338466763496399, + "learning_rate": 7.293137516053187e-05, + "loss": 0.6045404672622681, + "step": 5696 + }, + { + "epoch": 2.4042194092827005, + "grad_norm": 1.198588252067566, + "learning_rate": 7.291003243751358e-05, + "loss": 0.6063475608825684, + "step": 5698 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 1.2315080165863037, + "learning_rate": 7.288868442959081e-05, + "loss": 0.5734809041023254, + "step": 5700 + }, + { + "epoch": 2.4050632911392404, + "eval_loss": 0.6841402053833008, + "eval_runtime": 941.6641, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5700 + }, + { + "epoch": 2.4059071729957804, + "grad_norm": 1.1494885683059692, + "learning_rate": 7.286733114168812e-05, + "loss": 0.5744594931602478, + "step": 5702 + }, + { + "epoch": 2.4067510548523208, + "grad_norm": 1.3769505023956299, + "learning_rate": 7.284597257873132e-05, + "loss": 0.611789882183075, + "step": 5704 + }, + { + "epoch": 2.4075949367088607, + "grad_norm": 1.2326449155807495, + "learning_rate": 7.28246087456474e-05, + "loss": 0.6091431975364685, + "step": 5706 + }, + { + "epoch": 2.4084388185654007, + "grad_norm": 1.1960830688476562, + "learning_rate": 7.28032396473646e-05, + "loss": 0.49431973695755005, + "step": 5708 + }, + { + "epoch": 2.409282700421941, + "grad_norm": 1.1672827005386353, + "learning_rate": 7.278186528881237e-05, + "loss": 0.5344718098640442, + "step": 5710 + }, + { + "epoch": 2.410126582278481, + "grad_norm": 1.1923719644546509, + "learning_rate": 7.276048567492136e-05, + "loss": 0.6011165380477905, + "step": 5712 + }, + { + "epoch": 2.410970464135021, + "grad_norm": 1.2314990758895874, + "learning_rate": 7.273910081062341e-05, + "loss": 0.6300925016403198, + "step": 5714 + }, + { + "epoch": 2.4118143459915613, + "grad_norm": 0.8976680040359497, + "learning_rate": 7.27177107008516e-05, + "loss": 0.56329345703125, + "step": 5716 + }, + { + "epoch": 2.4126582278481012, + "grad_norm": 1.2954038381576538, + "learning_rate": 7.269631535054026e-05, + "loss": 0.6266427040100098, + "step": 5718 + }, + { + "epoch": 2.413502109704641, + "grad_norm": 1.3357585668563843, + "learning_rate": 7.267491476462485e-05, + "loss": 0.6234018802642822, + "step": 5720 + }, + { + "epoch": 2.4143459915611816, + "grad_norm": 1.1913645267486572, + "learning_rate": 7.265350894804209e-05, + "loss": 0.5909059047698975, + "step": 5722 + }, + { + "epoch": 2.4151898734177215, + "grad_norm": 1.3425955772399902, + "learning_rate": 7.263209790572986e-05, + "loss": 0.5708479285240173, + "step": 5724 + }, + { + "epoch": 2.4160337552742615, + "grad_norm": 1.2258507013320923, + "learning_rate": 7.261068164262734e-05, + "loss": 0.5810034871101379, + "step": 5726 + }, + { + "epoch": 2.416877637130802, + "grad_norm": 1.348794937133789, + "learning_rate": 7.258926016367479e-05, + "loss": 0.5939235687255859, + "step": 5728 + }, + { + "epoch": 2.4177215189873418, + "grad_norm": 1.0896574258804321, + "learning_rate": 7.256783347381375e-05, + "loss": 0.6298259496688843, + "step": 5730 + }, + { + "epoch": 2.4185654008438817, + "grad_norm": 1.164866328239441, + "learning_rate": 7.254640157798696e-05, + "loss": 0.5277430415153503, + "step": 5732 + }, + { + "epoch": 2.419409282700422, + "grad_norm": 1.1215453147888184, + "learning_rate": 7.252496448113833e-05, + "loss": 0.5724055767059326, + "step": 5734 + }, + { + "epoch": 2.420253164556962, + "grad_norm": 1.0640764236450195, + "learning_rate": 7.2503522188213e-05, + "loss": 0.5439977645874023, + "step": 5736 + }, + { + "epoch": 2.421097046413502, + "grad_norm": 1.4874604940414429, + "learning_rate": 7.248207470415729e-05, + "loss": 0.7568614482879639, + "step": 5738 + }, + { + "epoch": 2.4219409282700424, + "grad_norm": 1.2611099481582642, + "learning_rate": 7.246062203391873e-05, + "loss": 0.6389632225036621, + "step": 5740 + }, + { + "epoch": 2.4227848101265823, + "grad_norm": 1.185644507408142, + "learning_rate": 7.243916418244602e-05, + "loss": 0.6180628538131714, + "step": 5742 + }, + { + "epoch": 2.4236286919831223, + "grad_norm": 1.1648430824279785, + "learning_rate": 7.241770115468909e-05, + "loss": 0.619799017906189, + "step": 5744 + }, + { + "epoch": 2.4244725738396626, + "grad_norm": 1.1974445581436157, + "learning_rate": 7.239623295559903e-05, + "loss": 0.6446201205253601, + "step": 5746 + }, + { + "epoch": 2.4253164556962026, + "grad_norm": 1.140477180480957, + "learning_rate": 7.237475959012818e-05, + "loss": 0.5839580297470093, + "step": 5748 + }, + { + "epoch": 2.4261603375527425, + "grad_norm": 1.1374423503875732, + "learning_rate": 7.235328106322998e-05, + "loss": 0.48815420269966125, + "step": 5750 + }, + { + "epoch": 2.427004219409283, + "grad_norm": 1.411432147026062, + "learning_rate": 7.233179737985916e-05, + "loss": 0.638519287109375, + "step": 5752 + }, + { + "epoch": 2.427848101265823, + "grad_norm": 1.1232497692108154, + "learning_rate": 7.231030854497157e-05, + "loss": 0.5776677131652832, + "step": 5754 + }, + { + "epoch": 2.428691983122363, + "grad_norm": 1.0815738439559937, + "learning_rate": 7.228881456352428e-05, + "loss": 0.5297027230262756, + "step": 5756 + }, + { + "epoch": 2.429535864978903, + "grad_norm": 1.2230733633041382, + "learning_rate": 7.226731544047553e-05, + "loss": 0.5630011558532715, + "step": 5758 + }, + { + "epoch": 2.430379746835443, + "grad_norm": 1.2033147811889648, + "learning_rate": 7.224581118078476e-05, + "loss": 0.5772101283073425, + "step": 5760 + }, + { + "epoch": 2.431223628691983, + "grad_norm": 1.2150053977966309, + "learning_rate": 7.22243017894126e-05, + "loss": 0.5412847399711609, + "step": 5762 + }, + { + "epoch": 2.4320675105485234, + "grad_norm": 1.0494824647903442, + "learning_rate": 7.220278727132083e-05, + "loss": 0.5568405389785767, + "step": 5764 + }, + { + "epoch": 2.4329113924050634, + "grad_norm": 1.2803306579589844, + "learning_rate": 7.218126763147244e-05, + "loss": 0.6022217869758606, + "step": 5766 + }, + { + "epoch": 2.4337552742616033, + "grad_norm": 1.0832798480987549, + "learning_rate": 7.215974287483163e-05, + "loss": 0.5568796396255493, + "step": 5768 + }, + { + "epoch": 2.4345991561181437, + "grad_norm": 1.1829264163970947, + "learning_rate": 7.213821300636372e-05, + "loss": 0.5607990026473999, + "step": 5770 + }, + { + "epoch": 2.4354430379746836, + "grad_norm": 2.3017473220825195, + "learning_rate": 7.211667803103523e-05, + "loss": 0.6382274031639099, + "step": 5772 + }, + { + "epoch": 2.4362869198312236, + "grad_norm": 1.1701387166976929, + "learning_rate": 7.209513795381388e-05, + "loss": 0.5748776793479919, + "step": 5774 + }, + { + "epoch": 2.4371308016877635, + "grad_norm": 1.0480856895446777, + "learning_rate": 7.207359277966856e-05, + "loss": 0.5760934352874756, + "step": 5776 + }, + { + "epoch": 2.437974683544304, + "grad_norm": 1.2263693809509277, + "learning_rate": 7.20520425135693e-05, + "loss": 0.6387208104133606, + "step": 5778 + }, + { + "epoch": 2.438818565400844, + "grad_norm": 1.219246506690979, + "learning_rate": 7.203048716048737e-05, + "loss": 0.6078037619590759, + "step": 5780 + }, + { + "epoch": 2.439662447257384, + "grad_norm": 1.2452640533447266, + "learning_rate": 7.200892672539515e-05, + "loss": 0.606924831867218, + "step": 5782 + }, + { + "epoch": 2.440506329113924, + "grad_norm": 1.3469732999801636, + "learning_rate": 7.198736121326621e-05, + "loss": 0.585297703742981, + "step": 5784 + }, + { + "epoch": 2.441350210970464, + "grad_norm": 1.151127576828003, + "learning_rate": 7.196579062907533e-05, + "loss": 0.5849902033805847, + "step": 5786 + }, + { + "epoch": 2.442194092827004, + "grad_norm": 1.0669564008712769, + "learning_rate": 7.19442149777984e-05, + "loss": 0.6150397062301636, + "step": 5788 + }, + { + "epoch": 2.4430379746835444, + "grad_norm": 1.1700209379196167, + "learning_rate": 7.192263426441252e-05, + "loss": 0.6324567794799805, + "step": 5790 + }, + { + "epoch": 2.4438818565400844, + "grad_norm": 1.2832094430923462, + "learning_rate": 7.190104849389597e-05, + "loss": 0.6202381253242493, + "step": 5792 + }, + { + "epoch": 2.4447257383966243, + "grad_norm": 1.2046177387237549, + "learning_rate": 7.187945767122813e-05, + "loss": 0.6156684756278992, + "step": 5794 + }, + { + "epoch": 2.4455696202531647, + "grad_norm": 1.031133770942688, + "learning_rate": 7.185786180138961e-05, + "loss": 0.5763497352600098, + "step": 5796 + }, + { + "epoch": 2.4464135021097047, + "grad_norm": 1.2803475856781006, + "learning_rate": 7.183626088936216e-05, + "loss": 0.5419677495956421, + "step": 5798 + }, + { + "epoch": 2.4472573839662446, + "grad_norm": 1.2407588958740234, + "learning_rate": 7.181465494012869e-05, + "loss": 0.629108190536499, + "step": 5800 + }, + { + "epoch": 2.4472573839662446, + "eval_loss": 0.6835155487060547, + "eval_runtime": 758.407, + "eval_samples_per_second": 2.778, + "eval_steps_per_second": 2.778, + "step": 5800 + }, + { + "epoch": 2.448101265822785, + "grad_norm": 1.3525878190994263, + "learning_rate": 7.17930439586733e-05, + "loss": 0.6146516799926758, + "step": 5802 + }, + { + "epoch": 2.448945147679325, + "grad_norm": 1.255921721458435, + "learning_rate": 7.177142794998121e-05, + "loss": 0.5796315670013428, + "step": 5804 + }, + { + "epoch": 2.449789029535865, + "grad_norm": 1.2135448455810547, + "learning_rate": 7.174980691903881e-05, + "loss": 0.5978766679763794, + "step": 5806 + }, + { + "epoch": 2.4506329113924052, + "grad_norm": 1.117942214012146, + "learning_rate": 7.172818087083367e-05, + "loss": 0.5941054821014404, + "step": 5808 + }, + { + "epoch": 2.451476793248945, + "grad_norm": 1.2917672395706177, + "learning_rate": 7.17065498103545e-05, + "loss": 0.6213865876197815, + "step": 5810 + }, + { + "epoch": 2.452320675105485, + "grad_norm": 1.2287952899932861, + "learning_rate": 7.168491374259118e-05, + "loss": 0.627090573310852, + "step": 5812 + }, + { + "epoch": 2.453164556962025, + "grad_norm": 1.2427480220794678, + "learning_rate": 7.16632726725347e-05, + "loss": 0.605871319770813, + "step": 5814 + }, + { + "epoch": 2.4540084388185655, + "grad_norm": 1.2568929195404053, + "learning_rate": 7.16416266051773e-05, + "loss": 0.5961518883705139, + "step": 5816 + }, + { + "epoch": 2.4548523206751054, + "grad_norm": 1.2202998399734497, + "learning_rate": 7.161997554551226e-05, + "loss": 0.585054874420166, + "step": 5818 + }, + { + "epoch": 2.4556962025316453, + "grad_norm": 1.2326043844223022, + "learning_rate": 7.159831949853409e-05, + "loss": 0.6219096779823303, + "step": 5820 + }, + { + "epoch": 2.4565400843881857, + "grad_norm": 1.2161623239517212, + "learning_rate": 7.15766584692384e-05, + "loss": 0.641189455986023, + "step": 5822 + }, + { + "epoch": 2.4573839662447257, + "grad_norm": 1.2391023635864258, + "learning_rate": 7.1554992462622e-05, + "loss": 0.577190101146698, + "step": 5824 + }, + { + "epoch": 2.4582278481012656, + "grad_norm": 1.0883333683013916, + "learning_rate": 7.153332148368281e-05, + "loss": 0.5264694690704346, + "step": 5826 + }, + { + "epoch": 2.459071729957806, + "grad_norm": 1.2129524946212769, + "learning_rate": 7.15116455374199e-05, + "loss": 0.631437361240387, + "step": 5828 + }, + { + "epoch": 2.459915611814346, + "grad_norm": 1.0476374626159668, + "learning_rate": 7.148996462883352e-05, + "loss": 0.5025489926338196, + "step": 5830 + }, + { + "epoch": 2.460759493670886, + "grad_norm": 1.1389570236206055, + "learning_rate": 7.146827876292502e-05, + "loss": 0.5903586745262146, + "step": 5832 + }, + { + "epoch": 2.4616033755274263, + "grad_norm": 1.4385539293289185, + "learning_rate": 7.14465879446969e-05, + "loss": 0.633786141872406, + "step": 5834 + }, + { + "epoch": 2.462447257383966, + "grad_norm": 1.1184585094451904, + "learning_rate": 7.142489217915283e-05, + "loss": 0.5889136791229248, + "step": 5836 + }, + { + "epoch": 2.463291139240506, + "grad_norm": 1.2257685661315918, + "learning_rate": 7.140319147129763e-05, + "loss": 0.5774597525596619, + "step": 5838 + }, + { + "epoch": 2.4641350210970465, + "grad_norm": 0.9524238109588623, + "learning_rate": 7.13814858261372e-05, + "loss": 0.5220611095428467, + "step": 5840 + }, + { + "epoch": 2.4649789029535865, + "grad_norm": 1.2814422845840454, + "learning_rate": 7.135977524867861e-05, + "loss": 0.5724858641624451, + "step": 5842 + }, + { + "epoch": 2.4658227848101264, + "grad_norm": 1.0978140830993652, + "learning_rate": 7.133805974393013e-05, + "loss": 0.5469759702682495, + "step": 5844 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 1.310279130935669, + "learning_rate": 7.131633931690104e-05, + "loss": 0.6554312705993652, + "step": 5846 + }, + { + "epoch": 2.4675105485232067, + "grad_norm": 1.286189317703247, + "learning_rate": 7.129461397260187e-05, + "loss": 0.6166019439697266, + "step": 5848 + }, + { + "epoch": 2.4683544303797467, + "grad_norm": 1.1586377620697021, + "learning_rate": 7.127288371604424e-05, + "loss": 0.6301121711730957, + "step": 5850 + }, + { + "epoch": 2.469198312236287, + "grad_norm": 1.1684564352035522, + "learning_rate": 7.125114855224087e-05, + "loss": 0.6022663712501526, + "step": 5852 + }, + { + "epoch": 2.470042194092827, + "grad_norm": 1.182511329650879, + "learning_rate": 7.122940848620567e-05, + "loss": 0.5959302186965942, + "step": 5854 + }, + { + "epoch": 2.470886075949367, + "grad_norm": 1.2383002042770386, + "learning_rate": 7.120766352295366e-05, + "loss": 0.6251413822174072, + "step": 5856 + }, + { + "epoch": 2.4717299578059073, + "grad_norm": 1.2001979351043701, + "learning_rate": 7.118591366750097e-05, + "loss": 0.6332544088363647, + "step": 5858 + }, + { + "epoch": 2.4725738396624473, + "grad_norm": 1.2166392803192139, + "learning_rate": 7.116415892486488e-05, + "loss": 0.5797795057296753, + "step": 5860 + }, + { + "epoch": 2.473417721518987, + "grad_norm": 1.2235382795333862, + "learning_rate": 7.114239930006379e-05, + "loss": 0.5335313081741333, + "step": 5862 + }, + { + "epoch": 2.4742616033755276, + "grad_norm": 1.2405973672866821, + "learning_rate": 7.112063479811724e-05, + "loss": 0.5536905527114868, + "step": 5864 + }, + { + "epoch": 2.4751054852320675, + "grad_norm": 1.116328477859497, + "learning_rate": 7.109886542404585e-05, + "loss": 0.554654061794281, + "step": 5866 + }, + { + "epoch": 2.4759493670886075, + "grad_norm": 1.2757837772369385, + "learning_rate": 7.107709118287143e-05, + "loss": 0.6017873287200928, + "step": 5868 + }, + { + "epoch": 2.476793248945148, + "grad_norm": 1.3445937633514404, + "learning_rate": 7.105531207961686e-05, + "loss": 0.6479908227920532, + "step": 5870 + }, + { + "epoch": 2.477637130801688, + "grad_norm": 1.1464542150497437, + "learning_rate": 7.103352811930619e-05, + "loss": 0.5829157829284668, + "step": 5872 + }, + { + "epoch": 2.4784810126582277, + "grad_norm": 1.3275130987167358, + "learning_rate": 7.101173930696453e-05, + "loss": 0.54380863904953, + "step": 5874 + }, + { + "epoch": 2.479324894514768, + "grad_norm": 1.006990909576416, + "learning_rate": 7.098994564761813e-05, + "loss": 0.6313910484313965, + "step": 5876 + }, + { + "epoch": 2.480168776371308, + "grad_norm": 1.1358299255371094, + "learning_rate": 7.09681471462944e-05, + "loss": 0.5343483090400696, + "step": 5878 + }, + { + "epoch": 2.481012658227848, + "grad_norm": 1.1456117630004883, + "learning_rate": 7.094634380802184e-05, + "loss": 0.49450409412384033, + "step": 5880 + }, + { + "epoch": 2.4818565400843884, + "grad_norm": 1.2961846590042114, + "learning_rate": 7.092453563783003e-05, + "loss": 0.6378757357597351, + "step": 5882 + }, + { + "epoch": 2.4827004219409283, + "grad_norm": 0.983889102935791, + "learning_rate": 7.090272264074972e-05, + "loss": 0.5937124490737915, + "step": 5884 + }, + { + "epoch": 2.4835443037974683, + "grad_norm": 1.0205817222595215, + "learning_rate": 7.088090482181273e-05, + "loss": 0.5301283597946167, + "step": 5886 + }, + { + "epoch": 2.4843881856540087, + "grad_norm": 1.1721397638320923, + "learning_rate": 7.085908218605204e-05, + "loss": 0.6191756129264832, + "step": 5888 + }, + { + "epoch": 2.4852320675105486, + "grad_norm": 1.2432814836502075, + "learning_rate": 7.083725473850168e-05, + "loss": 0.5928890109062195, + "step": 5890 + }, + { + "epoch": 2.4860759493670885, + "grad_norm": 1.252125859260559, + "learning_rate": 7.081542248419686e-05, + "loss": 0.6136764287948608, + "step": 5892 + }, + { + "epoch": 2.486919831223629, + "grad_norm": 1.3686699867248535, + "learning_rate": 7.079358542817382e-05, + "loss": 0.6084910035133362, + "step": 5894 + }, + { + "epoch": 2.487763713080169, + "grad_norm": 1.0877282619476318, + "learning_rate": 7.077174357546996e-05, + "loss": 0.5862250924110413, + "step": 5896 + }, + { + "epoch": 2.488607594936709, + "grad_norm": 1.164095401763916, + "learning_rate": 7.074989693112381e-05, + "loss": 0.6300894021987915, + "step": 5898 + }, + { + "epoch": 2.489451476793249, + "grad_norm": 1.1169507503509521, + "learning_rate": 7.072804550017493e-05, + "loss": 0.5508570075035095, + "step": 5900 + }, + { + "epoch": 2.489451476793249, + "eval_loss": 0.6820966005325317, + "eval_runtime": 513.3515, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 5900 + }, + { + "epoch": 2.490295358649789, + "grad_norm": 1.1718615293502808, + "learning_rate": 7.070618928766406e-05, + "loss": 0.550847589969635, + "step": 5902 + }, + { + "epoch": 2.491139240506329, + "grad_norm": 1.4725650548934937, + "learning_rate": 7.068432829863298e-05, + "loss": 0.5663347840309143, + "step": 5904 + }, + { + "epoch": 2.491983122362869, + "grad_norm": 1.042083978652954, + "learning_rate": 7.066246253812462e-05, + "loss": 0.5506191849708557, + "step": 5906 + }, + { + "epoch": 2.4928270042194094, + "grad_norm": 1.2020974159240723, + "learning_rate": 7.064059201118297e-05, + "loss": 0.5656929612159729, + "step": 5908 + }, + { + "epoch": 2.4936708860759493, + "grad_norm": 1.1040663719177246, + "learning_rate": 7.061871672285317e-05, + "loss": 0.5159370303153992, + "step": 5910 + }, + { + "epoch": 2.4945147679324893, + "grad_norm": 1.3681589365005493, + "learning_rate": 7.05968366781814e-05, + "loss": 0.6161949634552002, + "step": 5912 + }, + { + "epoch": 2.4953586497890297, + "grad_norm": 1.26628839969635, + "learning_rate": 7.057495188221498e-05, + "loss": 0.6357758641242981, + "step": 5914 + }, + { + "epoch": 2.4962025316455696, + "grad_norm": 1.2714020013809204, + "learning_rate": 7.05530623400023e-05, + "loss": 0.5467366576194763, + "step": 5916 + }, + { + "epoch": 2.4970464135021095, + "grad_norm": 1.2255018949508667, + "learning_rate": 7.053116805659287e-05, + "loss": 0.592526376247406, + "step": 5918 + }, + { + "epoch": 2.49789029535865, + "grad_norm": 1.2816206216812134, + "learning_rate": 7.050926903703729e-05, + "loss": 0.5819981694221497, + "step": 5920 + }, + { + "epoch": 2.49873417721519, + "grad_norm": 1.1938221454620361, + "learning_rate": 7.048736528638722e-05, + "loss": 0.6037712693214417, + "step": 5922 + }, + { + "epoch": 2.49957805907173, + "grad_norm": 1.1330323219299316, + "learning_rate": 7.046545680969545e-05, + "loss": 0.5567215085029602, + "step": 5924 + }, + { + "epoch": 2.50042194092827, + "grad_norm": 1.233564019203186, + "learning_rate": 7.044354361201585e-05, + "loss": 0.5626974105834961, + "step": 5926 + }, + { + "epoch": 2.50126582278481, + "grad_norm": 1.1913540363311768, + "learning_rate": 7.042162569840336e-05, + "loss": 0.5672739744186401, + "step": 5928 + }, + { + "epoch": 2.50210970464135, + "grad_norm": 1.060952067375183, + "learning_rate": 7.039970307391402e-05, + "loss": 0.5965602993965149, + "step": 5930 + }, + { + "epoch": 2.5029535864978905, + "grad_norm": 1.2003182172775269, + "learning_rate": 7.037777574360497e-05, + "loss": 0.590932309627533, + "step": 5932 + }, + { + "epoch": 2.5037974683544304, + "grad_norm": 1.073434829711914, + "learning_rate": 7.035584371253441e-05, + "loss": 0.5736868381500244, + "step": 5934 + }, + { + "epoch": 2.5046413502109703, + "grad_norm": 1.2641130685806274, + "learning_rate": 7.033390698576166e-05, + "loss": 0.614703357219696, + "step": 5936 + }, + { + "epoch": 2.5054852320675103, + "grad_norm": 1.2406511306762695, + "learning_rate": 7.031196556834708e-05, + "loss": 0.5866397023200989, + "step": 5938 + }, + { + "epoch": 2.5063291139240507, + "grad_norm": 1.231619119644165, + "learning_rate": 7.029001946535215e-05, + "loss": 0.5792667865753174, + "step": 5940 + }, + { + "epoch": 2.5071729957805906, + "grad_norm": 1.419447660446167, + "learning_rate": 7.026806868183939e-05, + "loss": 0.5686604976654053, + "step": 5942 + }, + { + "epoch": 2.5080168776371305, + "grad_norm": 1.139244556427002, + "learning_rate": 7.024611322287245e-05, + "loss": 0.5860661268234253, + "step": 5944 + }, + { + "epoch": 2.508860759493671, + "grad_norm": 1.070517897605896, + "learning_rate": 7.022415309351602e-05, + "loss": 0.5823250412940979, + "step": 5946 + }, + { + "epoch": 2.509704641350211, + "grad_norm": 1.0775398015975952, + "learning_rate": 7.020218829883589e-05, + "loss": 0.5291389226913452, + "step": 5948 + }, + { + "epoch": 2.510548523206751, + "grad_norm": 1.339716911315918, + "learning_rate": 7.018021884389892e-05, + "loss": 0.6215447783470154, + "step": 5950 + }, + { + "epoch": 2.511392405063291, + "grad_norm": 1.3589707612991333, + "learning_rate": 7.0158244733773e-05, + "loss": 0.5419909358024597, + "step": 5952 + }, + { + "epoch": 2.512236286919831, + "grad_norm": 1.1664098501205444, + "learning_rate": 7.01362659735272e-05, + "loss": 0.5476977229118347, + "step": 5954 + }, + { + "epoch": 2.513080168776371, + "grad_norm": 1.1184223890304565, + "learning_rate": 7.011428256823154e-05, + "loss": 0.5896323919296265, + "step": 5956 + }, + { + "epoch": 2.5139240506329115, + "grad_norm": 1.4071170091629028, + "learning_rate": 7.00922945229572e-05, + "loss": 0.6353691220283508, + "step": 5958 + }, + { + "epoch": 2.5147679324894514, + "grad_norm": 1.3740885257720947, + "learning_rate": 7.007030184277641e-05, + "loss": 0.6605582237243652, + "step": 5960 + }, + { + "epoch": 2.5156118143459913, + "grad_norm": 1.071395754814148, + "learning_rate": 7.004830453276241e-05, + "loss": 0.6399887800216675, + "step": 5962 + }, + { + "epoch": 2.5164556962025317, + "grad_norm": 1.2292311191558838, + "learning_rate": 7.002630259798962e-05, + "loss": 0.5992775559425354, + "step": 5964 + }, + { + "epoch": 2.5172995780590717, + "grad_norm": 1.0133391618728638, + "learning_rate": 7.000429604353341e-05, + "loss": 0.5716721415519714, + "step": 5966 + }, + { + "epoch": 2.5181434599156116, + "grad_norm": 1.2669343948364258, + "learning_rate": 6.998228487447032e-05, + "loss": 0.5455520749092102, + "step": 5968 + }, + { + "epoch": 2.518987341772152, + "grad_norm": 1.2026386260986328, + "learning_rate": 6.996026909587785e-05, + "loss": 0.6411572694778442, + "step": 5970 + }, + { + "epoch": 2.519831223628692, + "grad_norm": 1.359923243522644, + "learning_rate": 6.993824871283465e-05, + "loss": 0.6687750220298767, + "step": 5972 + }, + { + "epoch": 2.520675105485232, + "grad_norm": 1.1265650987625122, + "learning_rate": 6.99162237304204e-05, + "loss": 0.6271382570266724, + "step": 5974 + }, + { + "epoch": 2.5215189873417723, + "grad_norm": 1.197667121887207, + "learning_rate": 6.989419415371583e-05, + "loss": 0.6191279888153076, + "step": 5976 + }, + { + "epoch": 2.522362869198312, + "grad_norm": 1.169992446899414, + "learning_rate": 6.987215998780275e-05, + "loss": 0.6313687562942505, + "step": 5978 + }, + { + "epoch": 2.523206751054852, + "grad_norm": 1.2706433534622192, + "learning_rate": 6.9850121237764e-05, + "loss": 0.6058336496353149, + "step": 5980 + }, + { + "epoch": 2.5240506329113925, + "grad_norm": 1.322376012802124, + "learning_rate": 6.982807790868352e-05, + "loss": 0.6466464400291443, + "step": 5982 + }, + { + "epoch": 2.5248945147679325, + "grad_norm": 1.2398571968078613, + "learning_rate": 6.980603000564626e-05, + "loss": 0.5730098485946655, + "step": 5984 + }, + { + "epoch": 2.5257383966244724, + "grad_norm": 1.2035216093063354, + "learning_rate": 6.978397753373826e-05, + "loss": 0.5305635333061218, + "step": 5986 + }, + { + "epoch": 2.526582278481013, + "grad_norm": 1.1951299905776978, + "learning_rate": 6.976192049804661e-05, + "loss": 0.5601096153259277, + "step": 5988 + }, + { + "epoch": 2.5274261603375527, + "grad_norm": 0.9950459599494934, + "learning_rate": 6.973985890365945e-05, + "loss": 0.5049516558647156, + "step": 5990 + }, + { + "epoch": 2.5282700421940927, + "grad_norm": 1.2581008672714233, + "learning_rate": 6.971779275566593e-05, + "loss": 0.5456960797309875, + "step": 5992 + }, + { + "epoch": 2.529113924050633, + "grad_norm": 1.2196903228759766, + "learning_rate": 6.969572205915632e-05, + "loss": 0.6026827096939087, + "step": 5994 + }, + { + "epoch": 2.529957805907173, + "grad_norm": 1.3109357357025146, + "learning_rate": 6.967364681922189e-05, + "loss": 0.597453236579895, + "step": 5996 + }, + { + "epoch": 2.530801687763713, + "grad_norm": 1.016904354095459, + "learning_rate": 6.965156704095498e-05, + "loss": 0.5304323434829712, + "step": 5998 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 1.2363858222961426, + "learning_rate": 6.962948272944896e-05, + "loss": 0.5748253464698792, + "step": 6000 + }, + { + "epoch": 2.5316455696202533, + "eval_loss": 0.6813357472419739, + "eval_runtime": 513.5491, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6000 + }, + { + "epoch": 2.5324894514767933, + "grad_norm": 1.1766576766967773, + "learning_rate": 6.960739388979827e-05, + "loss": 0.613327145576477, + "step": 6002 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 1.4065337181091309, + "learning_rate": 6.95853005270984e-05, + "loss": 0.6648217439651489, + "step": 6004 + }, + { + "epoch": 2.5341772151898736, + "grad_norm": 0.9513862133026123, + "learning_rate": 6.956320264644582e-05, + "loss": 0.5165349841117859, + "step": 6006 + }, + { + "epoch": 2.5350210970464135, + "grad_norm": 1.1104962825775146, + "learning_rate": 6.95411002529381e-05, + "loss": 0.5594159364700317, + "step": 6008 + }, + { + "epoch": 2.5358649789029535, + "grad_norm": 1.1698877811431885, + "learning_rate": 6.951899335167386e-05, + "loss": 0.5662833452224731, + "step": 6010 + }, + { + "epoch": 2.536708860759494, + "grad_norm": 1.2051950693130493, + "learning_rate": 6.949688194775272e-05, + "loss": 0.5780806541442871, + "step": 6012 + }, + { + "epoch": 2.537552742616034, + "grad_norm": 1.2434250116348267, + "learning_rate": 6.947476604627536e-05, + "loss": 0.6112543344497681, + "step": 6014 + }, + { + "epoch": 2.5383966244725737, + "grad_norm": 1.1473076343536377, + "learning_rate": 6.945264565234348e-05, + "loss": 0.5556519031524658, + "step": 6016 + }, + { + "epoch": 2.539240506329114, + "grad_norm": 1.3139631748199463, + "learning_rate": 6.943052077105987e-05, + "loss": 0.6664283275604248, + "step": 6018 + }, + { + "epoch": 2.540084388185654, + "grad_norm": 1.3407402038574219, + "learning_rate": 6.940839140752825e-05, + "loss": 0.6358945369720459, + "step": 6020 + }, + { + "epoch": 2.540928270042194, + "grad_norm": 1.2223491668701172, + "learning_rate": 6.938625756685352e-05, + "loss": 0.6310063600540161, + "step": 6022 + }, + { + "epoch": 2.5417721518987344, + "grad_norm": 1.3984094858169556, + "learning_rate": 6.936411925414146e-05, + "loss": 0.6090726256370544, + "step": 6024 + }, + { + "epoch": 2.5426160337552743, + "grad_norm": 1.1876440048217773, + "learning_rate": 6.9341976474499e-05, + "loss": 0.585586428642273, + "step": 6026 + }, + { + "epoch": 2.5434599156118143, + "grad_norm": 1.2213155031204224, + "learning_rate": 6.931982923303402e-05, + "loss": 0.6382114887237549, + "step": 6028 + }, + { + "epoch": 2.5443037974683547, + "grad_norm": 1.0637959241867065, + "learning_rate": 6.92976775348555e-05, + "loss": 0.5851555466651917, + "step": 6030 + }, + { + "epoch": 2.5451476793248946, + "grad_norm": 1.150227665901184, + "learning_rate": 6.927552138507337e-05, + "loss": 0.5867910385131836, + "step": 6032 + }, + { + "epoch": 2.5459915611814345, + "grad_norm": 1.1405255794525146, + "learning_rate": 6.925336078879865e-05, + "loss": 0.5876969695091248, + "step": 6034 + }, + { + "epoch": 2.546835443037975, + "grad_norm": 1.0269757509231567, + "learning_rate": 6.923119575114339e-05, + "loss": 0.626306414604187, + "step": 6036 + }, + { + "epoch": 2.547679324894515, + "grad_norm": 1.1978809833526611, + "learning_rate": 6.920902627722059e-05, + "loss": 0.645074188709259, + "step": 6038 + }, + { + "epoch": 2.548523206751055, + "grad_norm": 1.1684149503707886, + "learning_rate": 6.918685237214435e-05, + "loss": 0.6284276247024536, + "step": 6040 + }, + { + "epoch": 2.549367088607595, + "grad_norm": 1.2538992166519165, + "learning_rate": 6.916467404102977e-05, + "loss": 0.5770997405052185, + "step": 6042 + }, + { + "epoch": 2.550210970464135, + "grad_norm": 1.2381856441497803, + "learning_rate": 6.914249128899294e-05, + "loss": 0.5501131415367126, + "step": 6044 + }, + { + "epoch": 2.551054852320675, + "grad_norm": 1.0487099885940552, + "learning_rate": 6.912030412115101e-05, + "loss": 0.5362627506256104, + "step": 6046 + }, + { + "epoch": 2.5518987341772155, + "grad_norm": 1.3471804857254028, + "learning_rate": 6.909811254262213e-05, + "loss": 0.6694624423980713, + "step": 6048 + }, + { + "epoch": 2.5527426160337554, + "grad_norm": 1.4262096881866455, + "learning_rate": 6.907591655852547e-05, + "loss": 0.642368733882904, + "step": 6050 + }, + { + "epoch": 2.5535864978902953, + "grad_norm": 1.171004295349121, + "learning_rate": 6.905371617398122e-05, + "loss": 0.6266166567802429, + "step": 6052 + }, + { + "epoch": 2.5544303797468353, + "grad_norm": 1.1249992847442627, + "learning_rate": 6.90315113941106e-05, + "loss": 0.5518985986709595, + "step": 6054 + }, + { + "epoch": 2.5552742616033757, + "grad_norm": 1.3049964904785156, + "learning_rate": 6.900930222403579e-05, + "loss": 0.5367884039878845, + "step": 6056 + }, + { + "epoch": 2.5561181434599156, + "grad_norm": 1.3548237085342407, + "learning_rate": 6.898708866888005e-05, + "loss": 0.6057673096656799, + "step": 6058 + }, + { + "epoch": 2.5569620253164556, + "grad_norm": 1.1422157287597656, + "learning_rate": 6.89648707337676e-05, + "loss": 0.5493726134300232, + "step": 6060 + }, + { + "epoch": 2.557805907172996, + "grad_norm": 1.0179574489593506, + "learning_rate": 6.89426484238237e-05, + "loss": 0.5055251717567444, + "step": 6062 + }, + { + "epoch": 2.558649789029536, + "grad_norm": 1.2062081098556519, + "learning_rate": 6.89204217441746e-05, + "loss": 0.6099714040756226, + "step": 6064 + }, + { + "epoch": 2.559493670886076, + "grad_norm": 1.3043999671936035, + "learning_rate": 6.889819069994759e-05, + "loss": 0.6432347893714905, + "step": 6066 + }, + { + "epoch": 2.5603375527426158, + "grad_norm": 1.241347074508667, + "learning_rate": 6.887595529627093e-05, + "loss": 0.6052974462509155, + "step": 6068 + }, + { + "epoch": 2.561181434599156, + "grad_norm": 1.2502845525741577, + "learning_rate": 6.88537155382739e-05, + "loss": 0.6239711046218872, + "step": 6070 + }, + { + "epoch": 2.562025316455696, + "grad_norm": 1.0815852880477905, + "learning_rate": 6.883147143108679e-05, + "loss": 0.5462124347686768, + "step": 6072 + }, + { + "epoch": 2.562869198312236, + "grad_norm": 1.1990602016448975, + "learning_rate": 6.880922297984087e-05, + "loss": 0.5727240443229675, + "step": 6074 + }, + { + "epoch": 2.5637130801687764, + "grad_norm": 1.016781210899353, + "learning_rate": 6.878697018966846e-05, + "loss": 0.5160089731216431, + "step": 6076 + }, + { + "epoch": 2.5645569620253164, + "grad_norm": 1.1946886777877808, + "learning_rate": 6.876471306570286e-05, + "loss": 0.6344075798988342, + "step": 6078 + }, + { + "epoch": 2.5654008438818563, + "grad_norm": 1.1460139751434326, + "learning_rate": 6.87424516130783e-05, + "loss": 0.6142247319221497, + "step": 6080 + }, + { + "epoch": 2.5662447257383967, + "grad_norm": 1.3636937141418457, + "learning_rate": 6.872018583693013e-05, + "loss": 0.6330769658088684, + "step": 6082 + }, + { + "epoch": 2.5670886075949366, + "grad_norm": 1.3545513153076172, + "learning_rate": 6.869791574239463e-05, + "loss": 0.6386255621910095, + "step": 6084 + }, + { + "epoch": 2.5679324894514766, + "grad_norm": 1.1196715831756592, + "learning_rate": 6.867564133460904e-05, + "loss": 0.5527385473251343, + "step": 6086 + }, + { + "epoch": 2.568776371308017, + "grad_norm": 1.0583977699279785, + "learning_rate": 6.865336261871168e-05, + "loss": 0.5689145922660828, + "step": 6088 + }, + { + "epoch": 2.569620253164557, + "grad_norm": 1.2963348627090454, + "learning_rate": 6.86310795998418e-05, + "loss": 0.5756540298461914, + "step": 6090 + }, + { + "epoch": 2.570464135021097, + "grad_norm": 1.122214436531067, + "learning_rate": 6.860879228313968e-05, + "loss": 0.6062834858894348, + "step": 6092 + }, + { + "epoch": 2.571308016877637, + "grad_norm": 1.1313230991363525, + "learning_rate": 6.858650067374657e-05, + "loss": 0.5526617169380188, + "step": 6094 + }, + { + "epoch": 2.572151898734177, + "grad_norm": 1.6992650032043457, + "learning_rate": 6.856420477680471e-05, + "loss": 0.5911332964897156, + "step": 6096 + }, + { + "epoch": 2.572995780590717, + "grad_norm": 1.2622860670089722, + "learning_rate": 6.854190459745735e-05, + "loss": 0.5730270743370056, + "step": 6098 + }, + { + "epoch": 2.5738396624472575, + "grad_norm": 1.1420512199401855, + "learning_rate": 6.851960014084868e-05, + "loss": 0.597838282585144, + "step": 6100 + }, + { + "epoch": 2.5738396624472575, + "eval_loss": 0.6812278628349304, + "eval_runtime": 513.4749, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6100 + }, + { + "epoch": 2.5746835443037974, + "grad_norm": 1.129335641860962, + "learning_rate": 6.849729141212396e-05, + "loss": 0.6048991084098816, + "step": 6102 + }, + { + "epoch": 2.5755274261603374, + "grad_norm": 1.161284327507019, + "learning_rate": 6.847497841642935e-05, + "loss": 0.6359057426452637, + "step": 6104 + }, + { + "epoch": 2.5763713080168777, + "grad_norm": 1.285344123840332, + "learning_rate": 6.845266115891203e-05, + "loss": 0.5858902335166931, + "step": 6106 + }, + { + "epoch": 2.5772151898734177, + "grad_norm": 1.085143804550171, + "learning_rate": 6.843033964472018e-05, + "loss": 0.5742247700691223, + "step": 6108 + }, + { + "epoch": 2.5780590717299576, + "grad_norm": 1.1920831203460693, + "learning_rate": 6.840801387900291e-05, + "loss": 0.6738532185554504, + "step": 6110 + }, + { + "epoch": 2.578902953586498, + "grad_norm": 1.2750232219696045, + "learning_rate": 6.838568386691042e-05, + "loss": 0.6046389937400818, + "step": 6112 + }, + { + "epoch": 2.579746835443038, + "grad_norm": 1.1027764081954956, + "learning_rate": 6.836334961359373e-05, + "loss": 0.6231611967086792, + "step": 6114 + }, + { + "epoch": 2.580590717299578, + "grad_norm": 1.2996546030044556, + "learning_rate": 6.834101112420497e-05, + "loss": 0.5848191380500793, + "step": 6116 + }, + { + "epoch": 2.5814345991561183, + "grad_norm": 1.2683454751968384, + "learning_rate": 6.831866840389719e-05, + "loss": 0.6160622835159302, + "step": 6118 + }, + { + "epoch": 2.5822784810126582, + "grad_norm": 1.049797534942627, + "learning_rate": 6.829632145782441e-05, + "loss": 0.5220097899436951, + "step": 6120 + }, + { + "epoch": 2.583122362869198, + "grad_norm": 1.1798468828201294, + "learning_rate": 6.827397029114168e-05, + "loss": 0.5709835290908813, + "step": 6122 + }, + { + "epoch": 2.5839662447257385, + "grad_norm": 1.0136369466781616, + "learning_rate": 6.825161490900495e-05, + "loss": 0.5086703300476074, + "step": 6124 + }, + { + "epoch": 2.5848101265822785, + "grad_norm": 1.147735595703125, + "learning_rate": 6.822925531657119e-05, + "loss": 0.5904423594474792, + "step": 6126 + }, + { + "epoch": 2.5856540084388184, + "grad_norm": 0.9979357123374939, + "learning_rate": 6.820689151899833e-05, + "loss": 0.5002011060714722, + "step": 6128 + }, + { + "epoch": 2.586497890295359, + "grad_norm": 1.4129728078842163, + "learning_rate": 6.818452352144527e-05, + "loss": 0.5694814920425415, + "step": 6130 + }, + { + "epoch": 2.5873417721518988, + "grad_norm": 1.1388975381851196, + "learning_rate": 6.816215132907186e-05, + "loss": 0.5448270440101624, + "step": 6132 + }, + { + "epoch": 2.5881856540084387, + "grad_norm": 1.268865942955017, + "learning_rate": 6.813977494703896e-05, + "loss": 0.6184739470481873, + "step": 6134 + }, + { + "epoch": 2.589029535864979, + "grad_norm": 1.2403846979141235, + "learning_rate": 6.811739438050835e-05, + "loss": 0.6493034958839417, + "step": 6136 + }, + { + "epoch": 2.589873417721519, + "grad_norm": 1.108298659324646, + "learning_rate": 6.809500963464282e-05, + "loss": 0.6168854236602783, + "step": 6138 + }, + { + "epoch": 2.590717299578059, + "grad_norm": 1.106427788734436, + "learning_rate": 6.807262071460609e-05, + "loss": 0.5734958052635193, + "step": 6140 + }, + { + "epoch": 2.5915611814345993, + "grad_norm": 1.147791862487793, + "learning_rate": 6.805022762556286e-05, + "loss": 0.5422238111495972, + "step": 6142 + }, + { + "epoch": 2.5924050632911393, + "grad_norm": 1.214465856552124, + "learning_rate": 6.802783037267874e-05, + "loss": 0.6511701345443726, + "step": 6144 + }, + { + "epoch": 2.5932489451476792, + "grad_norm": 1.087735891342163, + "learning_rate": 6.800542896112043e-05, + "loss": 0.5978493094444275, + "step": 6146 + }, + { + "epoch": 2.5940928270042196, + "grad_norm": 1.0772241353988647, + "learning_rate": 6.798302339605544e-05, + "loss": 0.5656765699386597, + "step": 6148 + }, + { + "epoch": 2.5949367088607596, + "grad_norm": 1.1666499376296997, + "learning_rate": 6.796061368265231e-05, + "loss": 0.6147777438163757, + "step": 6150 + }, + { + "epoch": 2.5957805907172995, + "grad_norm": 0.9949467182159424, + "learning_rate": 6.793819982608057e-05, + "loss": 0.502659022808075, + "step": 6152 + }, + { + "epoch": 2.59662447257384, + "grad_norm": 1.311484456062317, + "learning_rate": 6.791578183151061e-05, + "loss": 0.6019812226295471, + "step": 6154 + }, + { + "epoch": 2.59746835443038, + "grad_norm": 0.9594855904579163, + "learning_rate": 6.789335970411387e-05, + "loss": 0.625690221786499, + "step": 6156 + }, + { + "epoch": 2.5983122362869198, + "grad_norm": 1.2252063751220703, + "learning_rate": 6.78709334490627e-05, + "loss": 0.628356397151947, + "step": 6158 + }, + { + "epoch": 2.59915611814346, + "grad_norm": 1.089603304862976, + "learning_rate": 6.784850307153043e-05, + "loss": 0.5447192192077637, + "step": 6160 + }, + { + "epoch": 2.6, + "grad_norm": 1.1035163402557373, + "learning_rate": 6.782606857669125e-05, + "loss": 0.5400487184524536, + "step": 6162 + }, + { + "epoch": 2.60084388185654, + "grad_norm": 1.2329976558685303, + "learning_rate": 6.780362996972042e-05, + "loss": 0.5795643329620361, + "step": 6164 + }, + { + "epoch": 2.6016877637130804, + "grad_norm": 1.2984000444412231, + "learning_rate": 6.778118725579408e-05, + "loss": 0.5664985775947571, + "step": 6166 + }, + { + "epoch": 2.6025316455696204, + "grad_norm": 1.3563600778579712, + "learning_rate": 6.775874044008933e-05, + "loss": 0.5406283140182495, + "step": 6168 + }, + { + "epoch": 2.6033755274261603, + "grad_norm": 1.1897385120391846, + "learning_rate": 6.773628952778421e-05, + "loss": 0.5362374782562256, + "step": 6170 + }, + { + "epoch": 2.6042194092827007, + "grad_norm": 1.1492685079574585, + "learning_rate": 6.771383452405773e-05, + "loss": 0.5942689180374146, + "step": 6172 + }, + { + "epoch": 2.6050632911392406, + "grad_norm": 1.2306408882141113, + "learning_rate": 6.769137543408985e-05, + "loss": 0.6144227981567383, + "step": 6174 + }, + { + "epoch": 2.6059071729957806, + "grad_norm": 1.1260589361190796, + "learning_rate": 6.766891226306143e-05, + "loss": 0.5147640705108643, + "step": 6176 + }, + { + "epoch": 2.606751054852321, + "grad_norm": 1.214007019996643, + "learning_rate": 6.764644501615427e-05, + "loss": 0.6822091341018677, + "step": 6178 + }, + { + "epoch": 2.607594936708861, + "grad_norm": 1.2251341342926025, + "learning_rate": 6.762397369855116e-05, + "loss": 0.5330857038497925, + "step": 6180 + }, + { + "epoch": 2.608438818565401, + "grad_norm": 1.3556525707244873, + "learning_rate": 6.760149831543578e-05, + "loss": 0.58979332447052, + "step": 6182 + }, + { + "epoch": 2.6092827004219408, + "grad_norm": 1.286598563194275, + "learning_rate": 6.757901887199278e-05, + "loss": 0.5667334198951721, + "step": 6184 + }, + { + "epoch": 2.610126582278481, + "grad_norm": 1.2515888214111328, + "learning_rate": 6.755653537340776e-05, + "loss": 0.6028750538825989, + "step": 6186 + }, + { + "epoch": 2.610970464135021, + "grad_norm": 1.1090617179870605, + "learning_rate": 6.753404782486719e-05, + "loss": 0.604102611541748, + "step": 6188 + }, + { + "epoch": 2.611814345991561, + "grad_norm": 1.1782273054122925, + "learning_rate": 6.751155623155853e-05, + "loss": 0.5486276745796204, + "step": 6190 + }, + { + "epoch": 2.6126582278481014, + "grad_norm": 1.5475431680679321, + "learning_rate": 6.748906059867018e-05, + "loss": 0.630682110786438, + "step": 6192 + }, + { + "epoch": 2.6135021097046414, + "grad_norm": 1.237891435623169, + "learning_rate": 6.746656093139143e-05, + "loss": 0.571597695350647, + "step": 6194 + }, + { + "epoch": 2.6143459915611813, + "grad_norm": 1.2367130517959595, + "learning_rate": 6.744405723491253e-05, + "loss": 0.6020040512084961, + "step": 6196 + }, + { + "epoch": 2.6151898734177212, + "grad_norm": 1.0747612714767456, + "learning_rate": 6.742154951442464e-05, + "loss": 0.5520704984664917, + "step": 6198 + }, + { + "epoch": 2.6160337552742616, + "grad_norm": 1.3944035768508911, + "learning_rate": 6.739903777511985e-05, + "loss": 0.7312755584716797, + "step": 6200 + }, + { + "epoch": 2.6160337552742616, + "eval_loss": 0.6795271039009094, + "eval_runtime": 513.2393, + "eval_samples_per_second": 4.105, + "eval_steps_per_second": 4.105, + "step": 6200 + }, + { + "epoch": 2.6168776371308016, + "grad_norm": 1.3716613054275513, + "learning_rate": 6.737652202219121e-05, + "loss": 0.617123007774353, + "step": 6202 + }, + { + "epoch": 2.6177215189873415, + "grad_norm": 1.1962300539016724, + "learning_rate": 6.735400226083267e-05, + "loss": 0.5791950225830078, + "step": 6204 + }, + { + "epoch": 2.618565400843882, + "grad_norm": 1.2570394277572632, + "learning_rate": 6.733147849623909e-05, + "loss": 0.5941018462181091, + "step": 6206 + }, + { + "epoch": 2.619409282700422, + "grad_norm": 1.2903523445129395, + "learning_rate": 6.730895073360628e-05, + "loss": 0.5417253971099854, + "step": 6208 + }, + { + "epoch": 2.620253164556962, + "grad_norm": 1.0618562698364258, + "learning_rate": 6.728641897813096e-05, + "loss": 0.536359965801239, + "step": 6210 + }, + { + "epoch": 2.621097046413502, + "grad_norm": 1.307300090789795, + "learning_rate": 6.726388323501077e-05, + "loss": 0.6409479975700378, + "step": 6212 + }, + { + "epoch": 2.621940928270042, + "grad_norm": 1.3672584295272827, + "learning_rate": 6.72413435094443e-05, + "loss": 0.66277676820755, + "step": 6214 + }, + { + "epoch": 2.622784810126582, + "grad_norm": 1.2156232595443726, + "learning_rate": 6.721879980663098e-05, + "loss": 0.6193054914474487, + "step": 6216 + }, + { + "epoch": 2.6236286919831224, + "grad_norm": 1.1575636863708496, + "learning_rate": 6.719625213177124e-05, + "loss": 0.5773701667785645, + "step": 6218 + }, + { + "epoch": 2.6244725738396624, + "grad_norm": 1.2327474355697632, + "learning_rate": 6.71737004900664e-05, + "loss": 0.6913977265357971, + "step": 6220 + }, + { + "epoch": 2.6253164556962023, + "grad_norm": 1.1316778659820557, + "learning_rate": 6.715114488671869e-05, + "loss": 0.5773524045944214, + "step": 6222 + }, + { + "epoch": 2.6261603375527427, + "grad_norm": 1.1508816480636597, + "learning_rate": 6.712858532693125e-05, + "loss": 0.5554601550102234, + "step": 6224 + }, + { + "epoch": 2.6270042194092826, + "grad_norm": 1.2404967546463013, + "learning_rate": 6.710602181590812e-05, + "loss": 0.6090670824050903, + "step": 6226 + }, + { + "epoch": 2.6278481012658226, + "grad_norm": 1.0721718072891235, + "learning_rate": 6.70834543588543e-05, + "loss": 0.5546537637710571, + "step": 6228 + }, + { + "epoch": 2.628691983122363, + "grad_norm": 1.2788114547729492, + "learning_rate": 6.706088296097564e-05, + "loss": 0.5939876437187195, + "step": 6230 + }, + { + "epoch": 2.629535864978903, + "grad_norm": 1.1952526569366455, + "learning_rate": 6.703830762747896e-05, + "loss": 0.5291836857795715, + "step": 6232 + }, + { + "epoch": 2.630379746835443, + "grad_norm": 1.0261807441711426, + "learning_rate": 6.701572836357191e-05, + "loss": 0.518436074256897, + "step": 6234 + }, + { + "epoch": 2.6312236286919832, + "grad_norm": 1.1804791688919067, + "learning_rate": 6.699314517446316e-05, + "loss": 0.5830684900283813, + "step": 6236 + }, + { + "epoch": 2.632067510548523, + "grad_norm": 1.2079823017120361, + "learning_rate": 6.697055806536214e-05, + "loss": 0.5899971127510071, + "step": 6238 + }, + { + "epoch": 2.632911392405063, + "grad_norm": 1.1989154815673828, + "learning_rate": 6.694796704147932e-05, + "loss": 0.6533132791519165, + "step": 6240 + }, + { + "epoch": 2.6337552742616035, + "grad_norm": 1.0621024370193481, + "learning_rate": 6.692537210802598e-05, + "loss": 0.5341002345085144, + "step": 6242 + }, + { + "epoch": 2.6345991561181434, + "grad_norm": 1.2911880016326904, + "learning_rate": 6.690277327021436e-05, + "loss": 0.6795719861984253, + "step": 6244 + }, + { + "epoch": 2.6354430379746834, + "grad_norm": 1.3586145639419556, + "learning_rate": 6.688017053325757e-05, + "loss": 0.5390555262565613, + "step": 6246 + }, + { + "epoch": 2.6362869198312238, + "grad_norm": 1.31569242477417, + "learning_rate": 6.685756390236964e-05, + "loss": 0.5935586094856262, + "step": 6248 + }, + { + "epoch": 2.6371308016877637, + "grad_norm": 1.0801384449005127, + "learning_rate": 6.683495338276547e-05, + "loss": 0.5845919847488403, + "step": 6250 + }, + { + "epoch": 2.6379746835443036, + "grad_norm": 1.179715633392334, + "learning_rate": 6.681233897966087e-05, + "loss": 0.6017906665802002, + "step": 6252 + }, + { + "epoch": 2.638818565400844, + "grad_norm": 1.1927930116653442, + "learning_rate": 6.678972069827255e-05, + "loss": 0.6637946367263794, + "step": 6254 + }, + { + "epoch": 2.639662447257384, + "grad_norm": 1.2167247533798218, + "learning_rate": 6.676709854381812e-05, + "loss": 0.5572535991668701, + "step": 6256 + }, + { + "epoch": 2.640506329113924, + "grad_norm": 1.2026311159133911, + "learning_rate": 6.674447252151608e-05, + "loss": 0.5426514148712158, + "step": 6258 + }, + { + "epoch": 2.6413502109704643, + "grad_norm": 1.101891279220581, + "learning_rate": 6.672184263658579e-05, + "loss": 0.5123113989830017, + "step": 6260 + }, + { + "epoch": 2.6421940928270042, + "grad_norm": 1.3467986583709717, + "learning_rate": 6.669920889424758e-05, + "loss": 0.6018276214599609, + "step": 6262 + }, + { + "epoch": 2.643037974683544, + "grad_norm": 1.2477779388427734, + "learning_rate": 6.667657129972257e-05, + "loss": 0.5618380308151245, + "step": 6264 + }, + { + "epoch": 2.6438818565400846, + "grad_norm": 1.1284273862838745, + "learning_rate": 6.665392985823287e-05, + "loss": 0.5541924834251404, + "step": 6266 + }, + { + "epoch": 2.6447257383966245, + "grad_norm": 1.2376370429992676, + "learning_rate": 6.663128457500137e-05, + "loss": 0.5534335970878601, + "step": 6268 + }, + { + "epoch": 2.6455696202531644, + "grad_norm": 1.3205965757369995, + "learning_rate": 6.660863545525196e-05, + "loss": 0.6160520315170288, + "step": 6270 + }, + { + "epoch": 2.646413502109705, + "grad_norm": 1.175926685333252, + "learning_rate": 6.65859825042093e-05, + "loss": 0.6035991311073303, + "step": 6272 + }, + { + "epoch": 2.6472573839662448, + "grad_norm": 1.2805176973342896, + "learning_rate": 6.656332572709901e-05, + "loss": 0.6101992130279541, + "step": 6274 + }, + { + "epoch": 2.6481012658227847, + "grad_norm": 1.2493922710418701, + "learning_rate": 6.65406651291476e-05, + "loss": 0.5665684342384338, + "step": 6276 + }, + { + "epoch": 2.648945147679325, + "grad_norm": 1.3103299140930176, + "learning_rate": 6.65180007155824e-05, + "loss": 0.682868242263794, + "step": 6278 + }, + { + "epoch": 2.649789029535865, + "grad_norm": 1.3098952770233154, + "learning_rate": 6.649533249163167e-05, + "loss": 0.6398087739944458, + "step": 6280 + }, + { + "epoch": 2.650632911392405, + "grad_norm": 1.230396032333374, + "learning_rate": 6.647266046252454e-05, + "loss": 0.5410205721855164, + "step": 6282 + }, + { + "epoch": 2.6514767932489454, + "grad_norm": 1.1755880117416382, + "learning_rate": 6.6449984633491e-05, + "loss": 0.6019781231880188, + "step": 6284 + }, + { + "epoch": 2.6523206751054853, + "grad_norm": 1.1013081073760986, + "learning_rate": 6.642730500976193e-05, + "loss": 0.5327204465866089, + "step": 6286 + }, + { + "epoch": 2.6531645569620252, + "grad_norm": 1.1285136938095093, + "learning_rate": 6.640462159656908e-05, + "loss": 0.6458070278167725, + "step": 6288 + }, + { + "epoch": 2.6540084388185656, + "grad_norm": 1.5320124626159668, + "learning_rate": 6.638193439914512e-05, + "loss": 0.6038496494293213, + "step": 6290 + }, + { + "epoch": 2.6548523206751056, + "grad_norm": 1.0231032371520996, + "learning_rate": 6.635924342272349e-05, + "loss": 0.5353283286094666, + "step": 6292 + }, + { + "epoch": 2.6556962025316455, + "grad_norm": 1.1871505975723267, + "learning_rate": 6.633654867253858e-05, + "loss": 0.644368588924408, + "step": 6294 + }, + { + "epoch": 2.656540084388186, + "grad_norm": 1.0641425848007202, + "learning_rate": 6.631385015382565e-05, + "loss": 0.5251830220222473, + "step": 6296 + }, + { + "epoch": 2.657383966244726, + "grad_norm": 0.8980898261070251, + "learning_rate": 6.62911478718208e-05, + "loss": 0.527733564376831, + "step": 6298 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 1.1694822311401367, + "learning_rate": 6.626844183176102e-05, + "loss": 0.5868222117424011, + "step": 6300 + }, + { + "epoch": 2.6582278481012658, + "eval_loss": 0.6781066656112671, + "eval_runtime": 512.3669, + "eval_samples_per_second": 4.112, + "eval_steps_per_second": 4.112, + "step": 6300 + }, + { + "epoch": 2.659071729957806, + "grad_norm": 1.3010352849960327, + "learning_rate": 6.624573203888413e-05, + "loss": 0.5965607166290283, + "step": 6302 + }, + { + "epoch": 2.659915611814346, + "grad_norm": 1.074964165687561, + "learning_rate": 6.62230184984289e-05, + "loss": 0.5776658654212952, + "step": 6304 + }, + { + "epoch": 2.660759493670886, + "grad_norm": 1.0930451154708862, + "learning_rate": 6.620030121563484e-05, + "loss": 0.584223210811615, + "step": 6306 + }, + { + "epoch": 2.6616033755274264, + "grad_norm": 1.1418803930282593, + "learning_rate": 6.617758019574243e-05, + "loss": 0.534063994884491, + "step": 6308 + }, + { + "epoch": 2.6624472573839664, + "grad_norm": 1.1602790355682373, + "learning_rate": 6.615485544399298e-05, + "loss": 0.5719610452651978, + "step": 6310 + }, + { + "epoch": 2.6632911392405063, + "grad_norm": 1.0926544666290283, + "learning_rate": 6.613212696562863e-05, + "loss": 0.5489934682846069, + "step": 6312 + }, + { + "epoch": 2.6641350210970463, + "grad_norm": 1.2560242414474487, + "learning_rate": 6.610939476589239e-05, + "loss": 0.5568612217903137, + "step": 6314 + }, + { + "epoch": 2.6649789029535866, + "grad_norm": 1.110960602760315, + "learning_rate": 6.60866588500282e-05, + "loss": 0.6019266247749329, + "step": 6316 + }, + { + "epoch": 2.6658227848101266, + "grad_norm": 1.333012342453003, + "learning_rate": 6.606391922328074e-05, + "loss": 0.6083081364631653, + "step": 6318 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.1256170272827148, + "learning_rate": 6.604117589089564e-05, + "loss": 0.5586183071136475, + "step": 6320 + }, + { + "epoch": 2.667510548523207, + "grad_norm": 1.2877609729766846, + "learning_rate": 6.601842885811934e-05, + "loss": 0.5676470994949341, + "step": 6322 + }, + { + "epoch": 2.668354430379747, + "grad_norm": 1.305034875869751, + "learning_rate": 6.599567813019914e-05, + "loss": 0.6470263600349426, + "step": 6324 + }, + { + "epoch": 2.669198312236287, + "grad_norm": 1.1695195436477661, + "learning_rate": 6.597292371238318e-05, + "loss": 0.588540256023407, + "step": 6326 + }, + { + "epoch": 2.670042194092827, + "grad_norm": 1.084652304649353, + "learning_rate": 6.59501656099205e-05, + "loss": 0.602922260761261, + "step": 6328 + }, + { + "epoch": 2.670886075949367, + "grad_norm": 1.1664962768554688, + "learning_rate": 6.592740382806094e-05, + "loss": 0.5613425970077515, + "step": 6330 + }, + { + "epoch": 2.671729957805907, + "grad_norm": 1.2208726406097412, + "learning_rate": 6.590463837205522e-05, + "loss": 0.5850927829742432, + "step": 6332 + }, + { + "epoch": 2.672573839662447, + "grad_norm": 1.0662479400634766, + "learning_rate": 6.588186924715488e-05, + "loss": 0.503675639629364, + "step": 6334 + }, + { + "epoch": 2.6734177215189874, + "grad_norm": 1.5318000316619873, + "learning_rate": 6.58590964586123e-05, + "loss": 0.6245100498199463, + "step": 6336 + }, + { + "epoch": 2.6742616033755273, + "grad_norm": 1.402784824371338, + "learning_rate": 6.583632001168077e-05, + "loss": 0.6556243896484375, + "step": 6338 + }, + { + "epoch": 2.6751054852320673, + "grad_norm": 1.2293213605880737, + "learning_rate": 6.581353991161435e-05, + "loss": 0.6398119926452637, + "step": 6340 + }, + { + "epoch": 2.6759493670886076, + "grad_norm": 1.2687599658966064, + "learning_rate": 6.579075616366797e-05, + "loss": 0.5792493224143982, + "step": 6342 + }, + { + "epoch": 2.6767932489451476, + "grad_norm": 1.2112480401992798, + "learning_rate": 6.576796877309741e-05, + "loss": 0.6669304966926575, + "step": 6344 + }, + { + "epoch": 2.6776371308016875, + "grad_norm": 1.3074487447738647, + "learning_rate": 6.574517774515929e-05, + "loss": 0.6012452840805054, + "step": 6346 + }, + { + "epoch": 2.678481012658228, + "grad_norm": 1.3157081604003906, + "learning_rate": 6.572238308511106e-05, + "loss": 0.6556297540664673, + "step": 6348 + }, + { + "epoch": 2.679324894514768, + "grad_norm": 1.0735292434692383, + "learning_rate": 6.569958479821099e-05, + "loss": 0.5607976317405701, + "step": 6350 + }, + { + "epoch": 2.680168776371308, + "grad_norm": 1.1896809339523315, + "learning_rate": 6.567678288971825e-05, + "loss": 0.6040812730789185, + "step": 6352 + }, + { + "epoch": 2.681012658227848, + "grad_norm": 1.1350760459899902, + "learning_rate": 6.565397736489274e-05, + "loss": 0.5807676911354065, + "step": 6354 + }, + { + "epoch": 2.681856540084388, + "grad_norm": 1.3865782022476196, + "learning_rate": 6.563116822899532e-05, + "loss": 0.5877989530563354, + "step": 6356 + }, + { + "epoch": 2.682700421940928, + "grad_norm": 1.218682050704956, + "learning_rate": 6.560835548728758e-05, + "loss": 0.614531397819519, + "step": 6358 + }, + { + "epoch": 2.6835443037974684, + "grad_norm": 1.06162691116333, + "learning_rate": 6.5585539145032e-05, + "loss": 0.5880973935127258, + "step": 6360 + }, + { + "epoch": 2.6843881856540084, + "grad_norm": 1.264328956604004, + "learning_rate": 6.556271920749187e-05, + "loss": 0.5795428156852722, + "step": 6362 + }, + { + "epoch": 2.6852320675105483, + "grad_norm": 1.335652470588684, + "learning_rate": 6.553989567993129e-05, + "loss": 0.5927176475524902, + "step": 6364 + }, + { + "epoch": 2.6860759493670887, + "grad_norm": 1.1110745668411255, + "learning_rate": 6.551706856761524e-05, + "loss": 0.5814473628997803, + "step": 6366 + }, + { + "epoch": 2.6869198312236287, + "grad_norm": 1.1731220483779907, + "learning_rate": 6.549423787580947e-05, + "loss": 0.557738184928894, + "step": 6368 + }, + { + "epoch": 2.6877637130801686, + "grad_norm": 1.2679874897003174, + "learning_rate": 6.54714036097806e-05, + "loss": 0.5947291254997253, + "step": 6370 + }, + { + "epoch": 2.688607594936709, + "grad_norm": 1.112322211265564, + "learning_rate": 6.544856577479606e-05, + "loss": 0.5769563317298889, + "step": 6372 + }, + { + "epoch": 2.689451476793249, + "grad_norm": 1.3385759592056274, + "learning_rate": 6.542572437612408e-05, + "loss": 0.6077675223350525, + "step": 6374 + }, + { + "epoch": 2.690295358649789, + "grad_norm": 1.0953450202941895, + "learning_rate": 6.540287941903375e-05, + "loss": 0.5600538849830627, + "step": 6376 + }, + { + "epoch": 2.6911392405063292, + "grad_norm": 1.2455042600631714, + "learning_rate": 6.538003090879495e-05, + "loss": 0.5828459858894348, + "step": 6378 + }, + { + "epoch": 2.691983122362869, + "grad_norm": 1.2563562393188477, + "learning_rate": 6.53571788506784e-05, + "loss": 0.5844002366065979, + "step": 6380 + }, + { + "epoch": 2.692827004219409, + "grad_norm": 1.3466061353683472, + "learning_rate": 6.533432324995563e-05, + "loss": 0.6632003784179688, + "step": 6382 + }, + { + "epoch": 2.6936708860759495, + "grad_norm": 1.2467784881591797, + "learning_rate": 6.531146411189899e-05, + "loss": 0.5532103180885315, + "step": 6384 + }, + { + "epoch": 2.6945147679324895, + "grad_norm": 1.344250202178955, + "learning_rate": 6.528860144178163e-05, + "loss": 0.5722881555557251, + "step": 6386 + }, + { + "epoch": 2.6953586497890294, + "grad_norm": 1.3688865900039673, + "learning_rate": 6.526573524487756e-05, + "loss": 0.6424282789230347, + "step": 6388 + }, + { + "epoch": 2.6962025316455698, + "grad_norm": 1.4252339601516724, + "learning_rate": 6.524286552646153e-05, + "loss": 0.5986620783805847, + "step": 6390 + }, + { + "epoch": 2.6970464135021097, + "grad_norm": 1.4102380275726318, + "learning_rate": 6.52199922918092e-05, + "loss": 0.6466318368911743, + "step": 6392 + }, + { + "epoch": 2.6978902953586497, + "grad_norm": 1.184442400932312, + "learning_rate": 6.519711554619692e-05, + "loss": 0.6259894371032715, + "step": 6394 + }, + { + "epoch": 2.69873417721519, + "grad_norm": 1.2751896381378174, + "learning_rate": 6.517423529490198e-05, + "loss": 0.5682622194290161, + "step": 6396 + }, + { + "epoch": 2.69957805907173, + "grad_norm": 1.3333114385604858, + "learning_rate": 6.515135154320236e-05, + "loss": 0.573390007019043, + "step": 6398 + }, + { + "epoch": 2.70042194092827, + "grad_norm": 1.2505477666854858, + "learning_rate": 6.512846429637693e-05, + "loss": 0.5839408040046692, + "step": 6400 + }, + { + "epoch": 2.70042194092827, + "eval_loss": 0.6764505505561829, + "eval_runtime": 512.7682, + "eval_samples_per_second": 4.109, + "eval_steps_per_second": 4.109, + "step": 6400 + }, + { + "epoch": 2.7012658227848103, + "grad_norm": 1.2822065353393555, + "learning_rate": 6.510557355970534e-05, + "loss": 0.6000106334686279, + "step": 6402 + }, + { + "epoch": 2.7021097046413503, + "grad_norm": 1.2144463062286377, + "learning_rate": 6.508267933846803e-05, + "loss": 0.5796633362770081, + "step": 6404 + }, + { + "epoch": 2.70295358649789, + "grad_norm": 1.189985990524292, + "learning_rate": 6.505978163794628e-05, + "loss": 0.5976626873016357, + "step": 6406 + }, + { + "epoch": 2.7037974683544306, + "grad_norm": 1.0484727621078491, + "learning_rate": 6.503688046342212e-05, + "loss": 0.5054599642753601, + "step": 6408 + }, + { + "epoch": 2.7046413502109705, + "grad_norm": 1.4333025217056274, + "learning_rate": 6.501397582017844e-05, + "loss": 0.6539149284362793, + "step": 6410 + }, + { + "epoch": 2.7054852320675105, + "grad_norm": 1.1808522939682007, + "learning_rate": 6.499106771349887e-05, + "loss": 0.5220640301704407, + "step": 6412 + }, + { + "epoch": 2.706329113924051, + "grad_norm": 2.8626298904418945, + "learning_rate": 6.496815614866791e-05, + "loss": 0.6019118428230286, + "step": 6414 + }, + { + "epoch": 2.707172995780591, + "grad_norm": 1.1092768907546997, + "learning_rate": 6.494524113097078e-05, + "loss": 0.5754269361495972, + "step": 6416 + }, + { + "epoch": 2.7080168776371307, + "grad_norm": 1.2416579723358154, + "learning_rate": 6.492232266569353e-05, + "loss": 0.5548025369644165, + "step": 6418 + }, + { + "epoch": 2.708860759493671, + "grad_norm": 1.012360692024231, + "learning_rate": 6.489940075812306e-05, + "loss": 0.5706405639648438, + "step": 6420 + }, + { + "epoch": 2.709704641350211, + "grad_norm": 1.376641869544983, + "learning_rate": 6.487647541354698e-05, + "loss": 0.5862169861793518, + "step": 6422 + }, + { + "epoch": 2.710548523206751, + "grad_norm": 1.2425684928894043, + "learning_rate": 6.485354663725374e-05, + "loss": 0.5928428769111633, + "step": 6424 + }, + { + "epoch": 2.7113924050632914, + "grad_norm": 1.0926302671432495, + "learning_rate": 6.483061443453254e-05, + "loss": 0.5903078317642212, + "step": 6426 + }, + { + "epoch": 2.7122362869198313, + "grad_norm": 1.3698115348815918, + "learning_rate": 6.480767881067342e-05, + "loss": 0.5848883986473083, + "step": 6428 + }, + { + "epoch": 2.7130801687763713, + "grad_norm": 1.2949504852294922, + "learning_rate": 6.478473977096718e-05, + "loss": 0.5285207629203796, + "step": 6430 + }, + { + "epoch": 2.7139240506329116, + "grad_norm": 1.3662208318710327, + "learning_rate": 6.476179732070543e-05, + "loss": 0.5965171456336975, + "step": 6432 + }, + { + "epoch": 2.7147679324894516, + "grad_norm": 1.3127343654632568, + "learning_rate": 6.473885146518055e-05, + "loss": 0.6549378037452698, + "step": 6434 + }, + { + "epoch": 2.7156118143459915, + "grad_norm": 1.199431300163269, + "learning_rate": 6.471590220968568e-05, + "loss": 0.574461042881012, + "step": 6436 + }, + { + "epoch": 2.716455696202532, + "grad_norm": 1.1624091863632202, + "learning_rate": 6.469294955951481e-05, + "loss": 0.6142178177833557, + "step": 6438 + }, + { + "epoch": 2.717299578059072, + "grad_norm": 1.2685147523880005, + "learning_rate": 6.466999351996266e-05, + "loss": 0.5775829553604126, + "step": 6440 + }, + { + "epoch": 2.718143459915612, + "grad_norm": 1.0987834930419922, + "learning_rate": 6.464703409632476e-05, + "loss": 0.5400159955024719, + "step": 6442 + }, + { + "epoch": 2.7189873417721517, + "grad_norm": 1.2638986110687256, + "learning_rate": 6.462407129389736e-05, + "loss": 0.558712899684906, + "step": 6444 + }, + { + "epoch": 2.719831223628692, + "grad_norm": 1.174168586730957, + "learning_rate": 6.46011051179776e-05, + "loss": 0.5465238094329834, + "step": 6446 + }, + { + "epoch": 2.720675105485232, + "grad_norm": 1.2185649871826172, + "learning_rate": 6.457813557386331e-05, + "loss": 0.629173219203949, + "step": 6448 + }, + { + "epoch": 2.721518987341772, + "grad_norm": 1.1563167572021484, + "learning_rate": 6.455516266685311e-05, + "loss": 0.5557543039321899, + "step": 6450 + }, + { + "epoch": 2.7223628691983124, + "grad_norm": 1.2934051752090454, + "learning_rate": 6.453218640224642e-05, + "loss": 0.6350696682929993, + "step": 6452 + }, + { + "epoch": 2.7232067510548523, + "grad_norm": 1.045218825340271, + "learning_rate": 6.450920678534342e-05, + "loss": 0.544219434261322, + "step": 6454 + }, + { + "epoch": 2.7240506329113923, + "grad_norm": 1.3102771043777466, + "learning_rate": 6.44862238214451e-05, + "loss": 0.6312481760978699, + "step": 6456 + }, + { + "epoch": 2.7248945147679327, + "grad_norm": 1.3338704109191895, + "learning_rate": 6.446323751585312e-05, + "loss": 0.5772860050201416, + "step": 6458 + }, + { + "epoch": 2.7257383966244726, + "grad_norm": 1.1826046705245972, + "learning_rate": 6.444024787387003e-05, + "loss": 0.5450227856636047, + "step": 6460 + }, + { + "epoch": 2.7265822784810125, + "grad_norm": 1.2449530363082886, + "learning_rate": 6.441725490079908e-05, + "loss": 0.5775642395019531, + "step": 6462 + }, + { + "epoch": 2.7274261603375525, + "grad_norm": 1.1204898357391357, + "learning_rate": 6.439425860194432e-05, + "loss": 0.5795316100120544, + "step": 6464 + }, + { + "epoch": 2.728270042194093, + "grad_norm": 1.179542064666748, + "learning_rate": 6.437125898261056e-05, + "loss": 0.6187583804130554, + "step": 6466 + }, + { + "epoch": 2.729113924050633, + "grad_norm": 1.2231724262237549, + "learning_rate": 6.434825604810333e-05, + "loss": 0.581790566444397, + "step": 6468 + }, + { + "epoch": 2.7299578059071727, + "grad_norm": 1.178859829902649, + "learning_rate": 6.432524980372902e-05, + "loss": 0.5470858812332153, + "step": 6470 + }, + { + "epoch": 2.730801687763713, + "grad_norm": 1.2092641592025757, + "learning_rate": 6.430224025479469e-05, + "loss": 0.591381311416626, + "step": 6472 + }, + { + "epoch": 2.731645569620253, + "grad_norm": 1.395704746246338, + "learning_rate": 6.42792274066082e-05, + "loss": 0.6809561252593994, + "step": 6474 + }, + { + "epoch": 2.732489451476793, + "grad_norm": 1.1937509775161743, + "learning_rate": 6.42562112644782e-05, + "loss": 0.5667102932929993, + "step": 6476 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 1.2181694507598877, + "learning_rate": 6.423319183371405e-05, + "loss": 0.5832397937774658, + "step": 6478 + }, + { + "epoch": 2.7341772151898733, + "grad_norm": 0.9961143732070923, + "learning_rate": 6.42101691196259e-05, + "loss": 0.5432526469230652, + "step": 6480 + }, + { + "epoch": 2.7350210970464133, + "grad_norm": 1.2029842138290405, + "learning_rate": 6.418714312752466e-05, + "loss": 0.5740163326263428, + "step": 6482 + }, + { + "epoch": 2.7358649789029537, + "grad_norm": 1.4317080974578857, + "learning_rate": 6.416411386272196e-05, + "loss": 0.6384599804878235, + "step": 6484 + }, + { + "epoch": 2.7367088607594936, + "grad_norm": 1.2837908267974854, + "learning_rate": 6.414108133053022e-05, + "loss": 0.6619245409965515, + "step": 6486 + }, + { + "epoch": 2.7375527426160335, + "grad_norm": 1.1140583753585815, + "learning_rate": 6.41180455362626e-05, + "loss": 0.5453745126724243, + "step": 6488 + }, + { + "epoch": 2.738396624472574, + "grad_norm": 1.1226048469543457, + "learning_rate": 6.409500648523302e-05, + "loss": 0.6225460171699524, + "step": 6490 + }, + { + "epoch": 2.739240506329114, + "grad_norm": 1.2367178201675415, + "learning_rate": 6.407196418275613e-05, + "loss": 0.5767168402671814, + "step": 6492 + }, + { + "epoch": 2.740084388185654, + "grad_norm": 1.4078115224838257, + "learning_rate": 6.404891863414736e-05, + "loss": 0.6131237745285034, + "step": 6494 + }, + { + "epoch": 2.740928270042194, + "grad_norm": 1.21550452709198, + "learning_rate": 6.40258698447229e-05, + "loss": 0.5236409306526184, + "step": 6496 + }, + { + "epoch": 2.741772151898734, + "grad_norm": 1.22257661819458, + "learning_rate": 6.400281781979962e-05, + "loss": 0.5483267307281494, + "step": 6498 + }, + { + "epoch": 2.742616033755274, + "grad_norm": 1.1525336503982544, + "learning_rate": 6.39797625646952e-05, + "loss": 0.6161116361618042, + "step": 6500 + }, + { + "epoch": 2.742616033755274, + "eval_loss": 0.6768895387649536, + "eval_runtime": 513.0657, + "eval_samples_per_second": 4.107, + "eval_steps_per_second": 4.107, + "step": 6500 + }, + { + "epoch": 2.7434599156118145, + "grad_norm": 1.094993233680725, + "learning_rate": 6.395670408472804e-05, + "loss": 0.5587809681892395, + "step": 6502 + }, + { + "epoch": 2.7443037974683544, + "grad_norm": 1.1560120582580566, + "learning_rate": 6.393364238521731e-05, + "loss": 0.6118067502975464, + "step": 6504 + }, + { + "epoch": 2.7451476793248943, + "grad_norm": 1.3500670194625854, + "learning_rate": 6.391057747148285e-05, + "loss": 0.6314222812652588, + "step": 6506 + }, + { + "epoch": 2.7459915611814347, + "grad_norm": 1.2182261943817139, + "learning_rate": 6.388750934884535e-05, + "loss": 0.5695898532867432, + "step": 6508 + }, + { + "epoch": 2.7468354430379747, + "grad_norm": 1.3393630981445312, + "learning_rate": 6.386443802262616e-05, + "loss": 0.5848485827445984, + "step": 6510 + }, + { + "epoch": 2.7476793248945146, + "grad_norm": 1.412109375, + "learning_rate": 6.384136349814737e-05, + "loss": 0.5920066237449646, + "step": 6512 + }, + { + "epoch": 2.748523206751055, + "grad_norm": 1.174395203590393, + "learning_rate": 6.381828578073186e-05, + "loss": 0.5770407319068909, + "step": 6514 + }, + { + "epoch": 2.749367088607595, + "grad_norm": 1.2811627388000488, + "learning_rate": 6.37952048757032e-05, + "loss": 0.5780549049377441, + "step": 6516 + }, + { + "epoch": 2.750210970464135, + "grad_norm": 1.0966699123382568, + "learning_rate": 6.377212078838573e-05, + "loss": 0.5276137590408325, + "step": 6518 + }, + { + "epoch": 2.7510548523206753, + "grad_norm": 1.082350730895996, + "learning_rate": 6.374903352410449e-05, + "loss": 0.5744844675064087, + "step": 6520 + }, + { + "epoch": 2.751898734177215, + "grad_norm": 1.342262864112854, + "learning_rate": 6.372594308818527e-05, + "loss": 0.6084962487220764, + "step": 6522 + }, + { + "epoch": 2.752742616033755, + "grad_norm": 1.1922634840011597, + "learning_rate": 6.370284948595458e-05, + "loss": 0.5551698803901672, + "step": 6524 + }, + { + "epoch": 2.7535864978902955, + "grad_norm": 1.1368752717971802, + "learning_rate": 6.36797527227397e-05, + "loss": 0.6398477554321289, + "step": 6526 + }, + { + "epoch": 2.7544303797468355, + "grad_norm": 1.1748154163360596, + "learning_rate": 6.365665280386857e-05, + "loss": 0.6201474666595459, + "step": 6528 + }, + { + "epoch": 2.7552742616033754, + "grad_norm": 1.2439727783203125, + "learning_rate": 6.363354973466993e-05, + "loss": 0.6196629405021667, + "step": 6530 + }, + { + "epoch": 2.756118143459916, + "grad_norm": 1.146153211593628, + "learning_rate": 6.36104435204732e-05, + "loss": 0.6379110813140869, + "step": 6532 + }, + { + "epoch": 2.7569620253164557, + "grad_norm": 1.118996024131775, + "learning_rate": 6.358733416660854e-05, + "loss": 0.5695750713348389, + "step": 6534 + }, + { + "epoch": 2.7578059071729957, + "grad_norm": 1.219043493270874, + "learning_rate": 6.356422167840685e-05, + "loss": 0.5846145153045654, + "step": 6536 + }, + { + "epoch": 2.758649789029536, + "grad_norm": 1.120754361152649, + "learning_rate": 6.354110606119973e-05, + "loss": 0.5762830972671509, + "step": 6538 + }, + { + "epoch": 2.759493670886076, + "grad_norm": 1.0562269687652588, + "learning_rate": 6.351798732031949e-05, + "loss": 0.605473518371582, + "step": 6540 + }, + { + "epoch": 2.760337552742616, + "grad_norm": 1.3034429550170898, + "learning_rate": 6.34948654610992e-05, + "loss": 0.6314473748207092, + "step": 6542 + }, + { + "epoch": 2.7611814345991563, + "grad_norm": 1.1129206418991089, + "learning_rate": 6.347174048887263e-05, + "loss": 0.5332847237586975, + "step": 6544 + }, + { + "epoch": 2.7620253164556963, + "grad_norm": 1.068705439567566, + "learning_rate": 6.344861240897423e-05, + "loss": 0.6015381813049316, + "step": 6546 + }, + { + "epoch": 2.762869198312236, + "grad_norm": 1.161868691444397, + "learning_rate": 6.342548122673925e-05, + "loss": 0.5989309549331665, + "step": 6548 + }, + { + "epoch": 2.7637130801687766, + "grad_norm": 1.1323082447052002, + "learning_rate": 6.340234694750359e-05, + "loss": 0.5843837261199951, + "step": 6550 + }, + { + "epoch": 2.7645569620253165, + "grad_norm": 1.2302695512771606, + "learning_rate": 6.337920957660388e-05, + "loss": 0.603590726852417, + "step": 6552 + }, + { + "epoch": 2.7654008438818565, + "grad_norm": 1.2483820915222168, + "learning_rate": 6.335606911937749e-05, + "loss": 0.6207526326179504, + "step": 6554 + }, + { + "epoch": 2.766244725738397, + "grad_norm": 1.353147029876709, + "learning_rate": 6.333292558116245e-05, + "loss": 0.5964639782905579, + "step": 6556 + }, + { + "epoch": 2.767088607594937, + "grad_norm": 1.2074922323226929, + "learning_rate": 6.330977896729755e-05, + "loss": 0.5078298449516296, + "step": 6558 + }, + { + "epoch": 2.7679324894514767, + "grad_norm": 1.208228588104248, + "learning_rate": 6.328662928312225e-05, + "loss": 0.5649725198745728, + "step": 6560 + }, + { + "epoch": 2.768776371308017, + "grad_norm": 1.2749123573303223, + "learning_rate": 6.326347653397676e-05, + "loss": 0.5552892684936523, + "step": 6562 + }, + { + "epoch": 2.769620253164557, + "grad_norm": 1.1484880447387695, + "learning_rate": 6.324032072520197e-05, + "loss": 0.6514022350311279, + "step": 6564 + }, + { + "epoch": 2.770464135021097, + "grad_norm": 1.1836612224578857, + "learning_rate": 6.321716186213946e-05, + "loss": 0.5342835783958435, + "step": 6566 + }, + { + "epoch": 2.7713080168776374, + "grad_norm": 1.1626124382019043, + "learning_rate": 6.319399995013154e-05, + "loss": 0.6427282691001892, + "step": 6568 + }, + { + "epoch": 2.7721518987341773, + "grad_norm": 1.0736790895462036, + "learning_rate": 6.317083499452123e-05, + "loss": 0.5326613187789917, + "step": 6570 + }, + { + "epoch": 2.7729957805907173, + "grad_norm": 1.1652518510818481, + "learning_rate": 6.314766700065227e-05, + "loss": 0.543228268623352, + "step": 6572 + }, + { + "epoch": 2.7738396624472577, + "grad_norm": 1.232256531715393, + "learning_rate": 6.3124495973869e-05, + "loss": 0.5558459758758545, + "step": 6574 + }, + { + "epoch": 2.7746835443037976, + "grad_norm": 1.3306560516357422, + "learning_rate": 6.310132191951659e-05, + "loss": 0.6432561874389648, + "step": 6576 + }, + { + "epoch": 2.7755274261603375, + "grad_norm": 1.3863320350646973, + "learning_rate": 6.307814484294083e-05, + "loss": 0.6424768567085266, + "step": 6578 + }, + { + "epoch": 2.7763713080168775, + "grad_norm": 1.186691164970398, + "learning_rate": 6.305496474948822e-05, + "loss": 0.5481483936309814, + "step": 6580 + }, + { + "epoch": 2.777215189873418, + "grad_norm": 1.2820651531219482, + "learning_rate": 6.303178164450596e-05, + "loss": 0.5352432727813721, + "step": 6582 + }, + { + "epoch": 2.778059071729958, + "grad_norm": 1.1904656887054443, + "learning_rate": 6.300859553334196e-05, + "loss": 0.6270323991775513, + "step": 6584 + }, + { + "epoch": 2.7789029535864977, + "grad_norm": 1.1635342836380005, + "learning_rate": 6.29854064213448e-05, + "loss": 0.5700342059135437, + "step": 6586 + }, + { + "epoch": 2.779746835443038, + "grad_norm": 1.1065751314163208, + "learning_rate": 6.296221431386379e-05, + "loss": 0.5618587136268616, + "step": 6588 + }, + { + "epoch": 2.780590717299578, + "grad_norm": 1.3106048107147217, + "learning_rate": 6.293901921624885e-05, + "loss": 0.5982993841171265, + "step": 6590 + }, + { + "epoch": 2.781434599156118, + "grad_norm": 1.210839867591858, + "learning_rate": 6.291582113385071e-05, + "loss": 0.6210941076278687, + "step": 6592 + }, + { + "epoch": 2.782278481012658, + "grad_norm": 1.1407668590545654, + "learning_rate": 6.289262007202066e-05, + "loss": 0.5711221694946289, + "step": 6594 + }, + { + "epoch": 2.7831223628691983, + "grad_norm": 1.2315012216567993, + "learning_rate": 6.286941603611078e-05, + "loss": 0.5741305947303772, + "step": 6596 + }, + { + "epoch": 2.7839662447257383, + "grad_norm": 1.3056857585906982, + "learning_rate": 6.284620903147377e-05, + "loss": 0.5329633951187134, + "step": 6598 + }, + { + "epoch": 2.7848101265822782, + "grad_norm": 1.1501489877700806, + "learning_rate": 6.282299906346306e-05, + "loss": 0.6097646951675415, + "step": 6600 + }, + { + "epoch": 2.7848101265822782, + "eval_loss": 0.6737648844718933, + "eval_runtime": 512.921, + "eval_samples_per_second": 4.108, + "eval_steps_per_second": 4.108, + "step": 6600 + }, + { + "epoch": 2.7856540084388186, + "grad_norm": 1.0871381759643555, + "learning_rate": 6.279978613743275e-05, + "loss": 0.5561007857322693, + "step": 6602 + }, + { + "epoch": 2.7864978902953585, + "grad_norm": 1.188563585281372, + "learning_rate": 6.277657025873758e-05, + "loss": 0.5803903341293335, + "step": 6604 + }, + { + "epoch": 2.7873417721518985, + "grad_norm": 1.1444810628890991, + "learning_rate": 6.275335143273305e-05, + "loss": 0.5143039226531982, + "step": 6606 + }, + { + "epoch": 2.788185654008439, + "grad_norm": 1.096595287322998, + "learning_rate": 6.273012966477526e-05, + "loss": 0.543094277381897, + "step": 6608 + }, + { + "epoch": 2.789029535864979, + "grad_norm": 1.195801019668579, + "learning_rate": 6.270690496022105e-05, + "loss": 0.5597999095916748, + "step": 6610 + }, + { + "epoch": 2.7898734177215188, + "grad_norm": 1.236894965171814, + "learning_rate": 6.26836773244279e-05, + "loss": 0.5496288537979126, + "step": 6612 + }, + { + "epoch": 2.790717299578059, + "grad_norm": 1.1474205255508423, + "learning_rate": 6.2660446762754e-05, + "loss": 0.6104549169540405, + "step": 6614 + }, + { + "epoch": 2.791561181434599, + "grad_norm": 1.1649401187896729, + "learning_rate": 6.263721328055818e-05, + "loss": 0.6186942458152771, + "step": 6616 + }, + { + "epoch": 2.792405063291139, + "grad_norm": 1.1187876462936401, + "learning_rate": 6.261397688319993e-05, + "loss": 0.5332194566726685, + "step": 6618 + }, + { + "epoch": 2.7932489451476794, + "grad_norm": 1.2765967845916748, + "learning_rate": 6.25907375760395e-05, + "loss": 0.6478220224380493, + "step": 6620 + }, + { + "epoch": 2.7940928270042193, + "grad_norm": 1.232173204421997, + "learning_rate": 6.256749536443771e-05, + "loss": 0.6406530141830444, + "step": 6622 + }, + { + "epoch": 2.7949367088607593, + "grad_norm": 1.045032262802124, + "learning_rate": 6.254425025375612e-05, + "loss": 0.6082814931869507, + "step": 6624 + }, + { + "epoch": 2.7957805907172997, + "grad_norm": 1.2285528182983398, + "learning_rate": 6.252100224935689e-05, + "loss": 0.6527243852615356, + "step": 6626 + }, + { + "epoch": 2.7966244725738396, + "grad_norm": 1.1741310358047485, + "learning_rate": 6.24977513566029e-05, + "loss": 0.5787529945373535, + "step": 6628 + }, + { + "epoch": 2.7974683544303796, + "grad_norm": 1.1933153867721558, + "learning_rate": 6.247449758085773e-05, + "loss": 0.5816542506217957, + "step": 6630 + }, + { + "epoch": 2.79831223628692, + "grad_norm": 1.3991938829421997, + "learning_rate": 6.245124092748552e-05, + "loss": 0.61644446849823, + "step": 6632 + }, + { + "epoch": 2.79915611814346, + "grad_norm": 1.1720032691955566, + "learning_rate": 6.242798140185117e-05, + "loss": 0.5762863755226135, + "step": 6634 + }, + { + "epoch": 2.8, + "grad_norm": 1.2190258502960205, + "learning_rate": 6.240471900932019e-05, + "loss": 0.656046986579895, + "step": 6636 + }, + { + "epoch": 2.80084388185654, + "grad_norm": 1.128190040588379, + "learning_rate": 6.238145375525877e-05, + "loss": 0.5192724466323853, + "step": 6638 + }, + { + "epoch": 2.80168776371308, + "grad_norm": 1.2625527381896973, + "learning_rate": 6.235818564503377e-05, + "loss": 0.6037933826446533, + "step": 6640 + }, + { + "epoch": 2.80253164556962, + "grad_norm": 1.2483288049697876, + "learning_rate": 6.233491468401268e-05, + "loss": 0.6108730435371399, + "step": 6642 + }, + { + "epoch": 2.8033755274261605, + "grad_norm": 1.3986961841583252, + "learning_rate": 6.231164087756367e-05, + "loss": 0.6408922672271729, + "step": 6644 + }, + { + "epoch": 2.8042194092827004, + "grad_norm": 1.2224489450454712, + "learning_rate": 6.228836423105556e-05, + "loss": 0.648504376411438, + "step": 6646 + }, + { + "epoch": 2.8050632911392404, + "grad_norm": 1.2060397863388062, + "learning_rate": 6.226508474985782e-05, + "loss": 0.5769880414009094, + "step": 6648 + }, + { + "epoch": 2.8059071729957807, + "grad_norm": 1.262581467628479, + "learning_rate": 6.224180243934058e-05, + "loss": 0.6585965752601624, + "step": 6650 + }, + { + "epoch": 2.8067510548523207, + "grad_norm": 1.1175196170806885, + "learning_rate": 6.221851730487463e-05, + "loss": 0.618746817111969, + "step": 6652 + }, + { + "epoch": 2.8075949367088606, + "grad_norm": 1.2256932258605957, + "learning_rate": 6.219522935183141e-05, + "loss": 0.5708954930305481, + "step": 6654 + }, + { + "epoch": 2.808438818565401, + "grad_norm": 1.3388983011245728, + "learning_rate": 6.217193858558298e-05, + "loss": 0.608521580696106, + "step": 6656 + }, + { + "epoch": 2.809282700421941, + "grad_norm": 1.2913719415664673, + "learning_rate": 6.214864501150208e-05, + "loss": 0.64382004737854, + "step": 6658 + }, + { + "epoch": 2.810126582278481, + "grad_norm": 1.039406657218933, + "learning_rate": 6.21253486349621e-05, + "loss": 0.567484438419342, + "step": 6660 + }, + { + "epoch": 2.8109704641350213, + "grad_norm": 1.123612642288208, + "learning_rate": 6.210204946133707e-05, + "loss": 0.5696196556091309, + "step": 6662 + }, + { + "epoch": 2.811814345991561, + "grad_norm": 1.1850367784500122, + "learning_rate": 6.207874749600164e-05, + "loss": 0.6068252921104431, + "step": 6664 + }, + { + "epoch": 2.812658227848101, + "grad_norm": 1.3630138635635376, + "learning_rate": 6.205544274433115e-05, + "loss": 0.6329811215400696, + "step": 6666 + }, + { + "epoch": 2.8135021097046415, + "grad_norm": 1.217410683631897, + "learning_rate": 6.203213521170154e-05, + "loss": 0.5600330829620361, + "step": 6668 + }, + { + "epoch": 2.8143459915611815, + "grad_norm": 3.5133564472198486, + "learning_rate": 6.200882490348942e-05, + "loss": 0.639461874961853, + "step": 6670 + }, + { + "epoch": 2.8151898734177214, + "grad_norm": 1.2535229921340942, + "learning_rate": 6.198551182507203e-05, + "loss": 0.5908592939376831, + "step": 6672 + }, + { + "epoch": 2.816033755274262, + "grad_norm": 1.2667300701141357, + "learning_rate": 6.196219598182726e-05, + "loss": 0.5490466952323914, + "step": 6674 + }, + { + "epoch": 2.8168776371308017, + "grad_norm": 1.332416296005249, + "learning_rate": 6.19388773791336e-05, + "loss": 0.6570454239845276, + "step": 6676 + }, + { + "epoch": 2.8177215189873417, + "grad_norm": 1.2882871627807617, + "learning_rate": 6.191555602237023e-05, + "loss": 0.6296758651733398, + "step": 6678 + }, + { + "epoch": 2.818565400843882, + "grad_norm": 1.2949540615081787, + "learning_rate": 6.189223191691691e-05, + "loss": 0.6238688826560974, + "step": 6680 + }, + { + "epoch": 2.819409282700422, + "grad_norm": 1.3507297039031982, + "learning_rate": 6.18689050681541e-05, + "loss": 0.6287838220596313, + "step": 6682 + }, + { + "epoch": 2.820253164556962, + "grad_norm": 1.0284801721572876, + "learning_rate": 6.184557548146282e-05, + "loss": 0.5871602892875671, + "step": 6684 + }, + { + "epoch": 2.8210970464135023, + "grad_norm": 1.3238089084625244, + "learning_rate": 6.182224316222478e-05, + "loss": 0.5973687171936035, + "step": 6686 + }, + { + "epoch": 2.8219409282700423, + "grad_norm": 1.0406007766723633, + "learning_rate": 6.179890811582232e-05, + "loss": 0.5463243722915649, + "step": 6688 + }, + { + "epoch": 2.8227848101265822, + "grad_norm": 1.1670905351638794, + "learning_rate": 6.177557034763832e-05, + "loss": 0.5976935625076294, + "step": 6690 + }, + { + "epoch": 2.8236286919831226, + "grad_norm": 1.0810848474502563, + "learning_rate": 6.175222986305642e-05, + "loss": 0.6159120798110962, + "step": 6692 + }, + { + "epoch": 2.8244725738396625, + "grad_norm": 1.1419588327407837, + "learning_rate": 6.172888666746078e-05, + "loss": 0.6232127547264099, + "step": 6694 + }, + { + "epoch": 2.8253164556962025, + "grad_norm": 1.118447184562683, + "learning_rate": 6.170554076623627e-05, + "loss": 0.579402506351471, + "step": 6696 + }, + { + "epoch": 2.826160337552743, + "grad_norm": 1.3584961891174316, + "learning_rate": 6.168219216476828e-05, + "loss": 0.5871124863624573, + "step": 6698 + }, + { + "epoch": 2.827004219409283, + "grad_norm": 1.1773170232772827, + "learning_rate": 6.165884086844295e-05, + "loss": 0.6119418144226074, + "step": 6700 + }, + { + "epoch": 2.827004219409283, + "eval_loss": 0.6737436056137085, + "eval_runtime": 513.2559, + "eval_samples_per_second": 4.105, + "eval_steps_per_second": 4.105, + "step": 6700 + }, + { + "epoch": 2.8278481012658228, + "grad_norm": 1.2150315046310425, + "learning_rate": 6.163548688264693e-05, + "loss": 0.606975257396698, + "step": 6702 + }, + { + "epoch": 2.828691983122363, + "grad_norm": 1.23250412940979, + "learning_rate": 6.161213021276754e-05, + "loss": 0.5860852003097534, + "step": 6704 + }, + { + "epoch": 2.829535864978903, + "grad_norm": 1.1053578853607178, + "learning_rate": 6.158877086419273e-05, + "loss": 0.543590784072876, + "step": 6706 + }, + { + "epoch": 2.830379746835443, + "grad_norm": 1.2813301086425781, + "learning_rate": 6.156540884231105e-05, + "loss": 0.6040283441543579, + "step": 6708 + }, + { + "epoch": 2.831223628691983, + "grad_norm": 1.2987254858016968, + "learning_rate": 6.154204415251169e-05, + "loss": 0.586407482624054, + "step": 6710 + }, + { + "epoch": 2.8320675105485233, + "grad_norm": 1.1980805397033691, + "learning_rate": 6.151867680018438e-05, + "loss": 0.6180199384689331, + "step": 6712 + }, + { + "epoch": 2.8329113924050633, + "grad_norm": 1.642957329750061, + "learning_rate": 6.149530679071956e-05, + "loss": 0.5772807002067566, + "step": 6714 + }, + { + "epoch": 2.8337552742616032, + "grad_norm": 1.3908783197402954, + "learning_rate": 6.147193412950825e-05, + "loss": 0.6107099652290344, + "step": 6716 + }, + { + "epoch": 2.8345991561181436, + "grad_norm": 1.3866089582443237, + "learning_rate": 6.144855882194206e-05, + "loss": 0.5335796475410461, + "step": 6718 + }, + { + "epoch": 2.8354430379746836, + "grad_norm": 1.2989959716796875, + "learning_rate": 6.14251808734132e-05, + "loss": 0.5962506532669067, + "step": 6720 + }, + { + "epoch": 2.8362869198312235, + "grad_norm": 1.3145360946655273, + "learning_rate": 6.140180028931456e-05, + "loss": 0.6368465423583984, + "step": 6722 + }, + { + "epoch": 2.8371308016877634, + "grad_norm": 1.1515997648239136, + "learning_rate": 6.137841707503955e-05, + "loss": 0.6448454856872559, + "step": 6724 + }, + { + "epoch": 2.837974683544304, + "grad_norm": 1.0785750150680542, + "learning_rate": 6.135503123598225e-05, + "loss": 0.49946340918540955, + "step": 6726 + }, + { + "epoch": 2.8388185654008438, + "grad_norm": 1.1683695316314697, + "learning_rate": 6.133164277753733e-05, + "loss": 0.550529956817627, + "step": 6728 + }, + { + "epoch": 2.8396624472573837, + "grad_norm": 1.0640658140182495, + "learning_rate": 6.130825170510006e-05, + "loss": 0.5135641098022461, + "step": 6730 + }, + { + "epoch": 2.840506329113924, + "grad_norm": 1.1805553436279297, + "learning_rate": 6.12848580240663e-05, + "loss": 0.6608622670173645, + "step": 6732 + }, + { + "epoch": 2.841350210970464, + "grad_norm": 1.2218462228775024, + "learning_rate": 6.12614617398325e-05, + "loss": 0.6797777414321899, + "step": 6734 + }, + { + "epoch": 2.842194092827004, + "grad_norm": 1.0677950382232666, + "learning_rate": 6.123806285779576e-05, + "loss": 0.5570073127746582, + "step": 6736 + }, + { + "epoch": 2.8430379746835444, + "grad_norm": 1.202785849571228, + "learning_rate": 6.121466138335376e-05, + "loss": 0.6273435354232788, + "step": 6738 + }, + { + "epoch": 2.8438818565400843, + "grad_norm": 1.1837576627731323, + "learning_rate": 6.119125732190477e-05, + "loss": 0.6337732076644897, + "step": 6740 + }, + { + "epoch": 2.8447257383966242, + "grad_norm": 1.2692649364471436, + "learning_rate": 6.116785067884764e-05, + "loss": 0.6228005886077881, + "step": 6742 + }, + { + "epoch": 2.8455696202531646, + "grad_norm": 1.3237874507904053, + "learning_rate": 6.114444145958183e-05, + "loss": 0.5781991481781006, + "step": 6744 + }, + { + "epoch": 2.8464135021097046, + "grad_norm": 1.2384692430496216, + "learning_rate": 6.112102966950742e-05, + "loss": 0.5583632588386536, + "step": 6746 + }, + { + "epoch": 2.8472573839662445, + "grad_norm": 1.1730914115905762, + "learning_rate": 6.109761531402505e-05, + "loss": 0.5704524517059326, + "step": 6748 + }, + { + "epoch": 2.848101265822785, + "grad_norm": 1.3047250509262085, + "learning_rate": 6.107419839853597e-05, + "loss": 0.5658026933670044, + "step": 6750 + }, + { + "epoch": 2.848945147679325, + "grad_norm": 1.2044686079025269, + "learning_rate": 6.105077892844198e-05, + "loss": 0.5919271111488342, + "step": 6752 + }, + { + "epoch": 2.8497890295358648, + "grad_norm": 1.1952540874481201, + "learning_rate": 6.102735690914554e-05, + "loss": 0.578326404094696, + "step": 6754 + }, + { + "epoch": 2.850632911392405, + "grad_norm": 1.2275413274765015, + "learning_rate": 6.1003932346049633e-05, + "loss": 0.6079645156860352, + "step": 6756 + }, + { + "epoch": 2.851476793248945, + "grad_norm": 1.2760299444198608, + "learning_rate": 6.0980505244557884e-05, + "loss": 0.6111302375793457, + "step": 6758 + }, + { + "epoch": 2.852320675105485, + "grad_norm": 1.4044286012649536, + "learning_rate": 6.095707561007444e-05, + "loss": 0.6397197246551514, + "step": 6760 + }, + { + "epoch": 2.8531645569620254, + "grad_norm": 1.3707174062728882, + "learning_rate": 6.0933643448004094e-05, + "loss": 0.6183030605316162, + "step": 6762 + }, + { + "epoch": 2.8540084388185654, + "grad_norm": 1.290480613708496, + "learning_rate": 6.091020876375221e-05, + "loss": 0.6367093920707703, + "step": 6764 + }, + { + "epoch": 2.8548523206751053, + "grad_norm": 1.0469609498977661, + "learning_rate": 6.0886771562724673e-05, + "loss": 0.550685703754425, + "step": 6766 + }, + { + "epoch": 2.8556962025316457, + "grad_norm": 1.312018871307373, + "learning_rate": 6.086333185032804e-05, + "loss": 0.5789266228675842, + "step": 6768 + }, + { + "epoch": 2.8565400843881856, + "grad_norm": 1.3253673315048218, + "learning_rate": 6.0839889631969374e-05, + "loss": 0.5595589876174927, + "step": 6770 + }, + { + "epoch": 2.8573839662447256, + "grad_norm": 1.2848154306411743, + "learning_rate": 6.0816444913056356e-05, + "loss": 0.5642995238304138, + "step": 6772 + }, + { + "epoch": 2.858227848101266, + "grad_norm": 1.2492237091064453, + "learning_rate": 6.079299769899722e-05, + "loss": 0.5502132773399353, + "step": 6774 + }, + { + "epoch": 2.859071729957806, + "grad_norm": 1.2817713022232056, + "learning_rate": 6.076954799520081e-05, + "loss": 0.5535969138145447, + "step": 6776 + }, + { + "epoch": 2.859915611814346, + "grad_norm": 1.1986786127090454, + "learning_rate": 6.074609580707651e-05, + "loss": 0.6086817979812622, + "step": 6778 + }, + { + "epoch": 2.8607594936708862, + "grad_norm": 1.274839162826538, + "learning_rate": 6.0722641140034285e-05, + "loss": 0.6254655718803406, + "step": 6780 + }, + { + "epoch": 2.861603375527426, + "grad_norm": 1.0627212524414062, + "learning_rate": 6.0699183999484685e-05, + "loss": 0.6227576732635498, + "step": 6782 + }, + { + "epoch": 2.862447257383966, + "grad_norm": 1.2313296794891357, + "learning_rate": 6.0675724390838815e-05, + "loss": 0.6257740259170532, + "step": 6784 + }, + { + "epoch": 2.8632911392405065, + "grad_norm": 1.1398836374282837, + "learning_rate": 6.065226231950837e-05, + "loss": 0.6438660621643066, + "step": 6786 + }, + { + "epoch": 2.8641350210970464, + "grad_norm": 1.1606178283691406, + "learning_rate": 6.0628797790905566e-05, + "loss": 0.5654972195625305, + "step": 6788 + }, + { + "epoch": 2.8649789029535864, + "grad_norm": 1.2857846021652222, + "learning_rate": 6.060533081044326e-05, + "loss": 0.5413897633552551, + "step": 6790 + }, + { + "epoch": 2.8658227848101268, + "grad_norm": 1.2358965873718262, + "learning_rate": 6.058186138353481e-05, + "loss": 0.5737078785896301, + "step": 6792 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 1.0813729763031006, + "learning_rate": 6.055838951559417e-05, + "loss": 0.5880253314971924, + "step": 6794 + }, + { + "epoch": 2.8675105485232066, + "grad_norm": 1.2310819625854492, + "learning_rate": 6.0534915212035836e-05, + "loss": 0.5762695074081421, + "step": 6796 + }, + { + "epoch": 2.868354430379747, + "grad_norm": 1.2762445211410522, + "learning_rate": 6.0511438478274906e-05, + "loss": 0.6172254085540771, + "step": 6798 + }, + { + "epoch": 2.869198312236287, + "grad_norm": 1.0100860595703125, + "learning_rate": 6.0487959319726994e-05, + "loss": 0.5419955849647522, + "step": 6800 + }, + { + "epoch": 2.869198312236287, + "eval_loss": 0.6721681356430054, + "eval_runtime": 513.1285, + "eval_samples_per_second": 4.106, + "eval_steps_per_second": 4.106, + "step": 6800 + }, + { + "epoch": 2.870042194092827, + "grad_norm": 1.3078527450561523, + "learning_rate": 6.046447774180827e-05, + "loss": 0.6330351233482361, + "step": 6802 + }, + { + "epoch": 2.8708860759493673, + "grad_norm": 1.3523176908493042, + "learning_rate": 6.044099374993553e-05, + "loss": 0.5479466915130615, + "step": 6804 + }, + { + "epoch": 2.8717299578059072, + "grad_norm": 1.109269142150879, + "learning_rate": 6.041750734952604e-05, + "loss": 0.5516952872276306, + "step": 6806 + }, + { + "epoch": 2.872573839662447, + "grad_norm": 1.2368918657302856, + "learning_rate": 6.039401854599769e-05, + "loss": 0.5878147482872009, + "step": 6808 + }, + { + "epoch": 2.8734177215189876, + "grad_norm": 1.1626032590866089, + "learning_rate": 6.037052734476886e-05, + "loss": 0.5637685656547546, + "step": 6810 + }, + { + "epoch": 2.8742616033755275, + "grad_norm": 1.1955288648605347, + "learning_rate": 6.0347033751258566e-05, + "loss": 0.5398213267326355, + "step": 6812 + }, + { + "epoch": 2.8751054852320674, + "grad_norm": 1.3805105686187744, + "learning_rate": 6.0323537770886285e-05, + "loss": 0.6098157167434692, + "step": 6814 + }, + { + "epoch": 2.875949367088608, + "grad_norm": 1.2644819021224976, + "learning_rate": 6.030003940907212e-05, + "loss": 0.5970560312271118, + "step": 6816 + }, + { + "epoch": 2.8767932489451478, + "grad_norm": 1.1625932455062866, + "learning_rate": 6.027653867123667e-05, + "loss": 0.5918156504631042, + "step": 6818 + }, + { + "epoch": 2.8776371308016877, + "grad_norm": 1.3591371774673462, + "learning_rate": 6.025303556280112e-05, + "loss": 0.5625584721565247, + "step": 6820 + }, + { + "epoch": 2.878481012658228, + "grad_norm": 1.266757845878601, + "learning_rate": 6.022953008918718e-05, + "loss": 0.6422242522239685, + "step": 6822 + }, + { + "epoch": 2.879324894514768, + "grad_norm": 1.273234248161316, + "learning_rate": 6.0206022255817095e-05, + "loss": 0.6625136733055115, + "step": 6824 + }, + { + "epoch": 2.880168776371308, + "grad_norm": 1.2808254957199097, + "learning_rate": 6.0182512068113715e-05, + "loss": 0.6410037279129028, + "step": 6826 + }, + { + "epoch": 2.8810126582278484, + "grad_norm": 1.1684991121292114, + "learning_rate": 6.0158999531500335e-05, + "loss": 0.5269461274147034, + "step": 6828 + }, + { + "epoch": 2.8818565400843883, + "grad_norm": 1.3655736446380615, + "learning_rate": 6.0135484651400886e-05, + "loss": 0.6546348929405212, + "step": 6830 + }, + { + "epoch": 2.8827004219409282, + "grad_norm": 1.3913087844848633, + "learning_rate": 6.011196743323977e-05, + "loss": 0.5872722864151001, + "step": 6832 + }, + { + "epoch": 2.8835443037974686, + "grad_norm": 1.1047117710113525, + "learning_rate": 6.008844788244199e-05, + "loss": 0.5498786568641663, + "step": 6834 + }, + { + "epoch": 2.8843881856540086, + "grad_norm": 1.0897705554962158, + "learning_rate": 6.006492600443301e-05, + "loss": 0.5740244388580322, + "step": 6836 + }, + { + "epoch": 2.8852320675105485, + "grad_norm": 1.0046823024749756, + "learning_rate": 6.004140180463891e-05, + "loss": 0.5618779063224792, + "step": 6838 + }, + { + "epoch": 2.8860759493670884, + "grad_norm": 1.231499195098877, + "learning_rate": 6.001787528848628e-05, + "loss": 0.6124269366264343, + "step": 6840 + }, + { + "epoch": 2.886919831223629, + "grad_norm": 1.1776596307754517, + "learning_rate": 5.999434646140219e-05, + "loss": 0.5512109994888306, + "step": 6842 + }, + { + "epoch": 2.8877637130801688, + "grad_norm": 1.2528871297836304, + "learning_rate": 5.9970815328814334e-05, + "loss": 0.610329270362854, + "step": 6844 + }, + { + "epoch": 2.8886075949367087, + "grad_norm": 1.4408416748046875, + "learning_rate": 5.994728189615087e-05, + "loss": 0.568793773651123, + "step": 6846 + }, + { + "epoch": 2.889451476793249, + "grad_norm": 1.2031673192977905, + "learning_rate": 5.9923746168840523e-05, + "loss": 0.6107773184776306, + "step": 6848 + }, + { + "epoch": 2.890295358649789, + "grad_norm": 1.3201221227645874, + "learning_rate": 5.990020815231251e-05, + "loss": 0.6217910647392273, + "step": 6850 + }, + { + "epoch": 2.891139240506329, + "grad_norm": 1.1753840446472168, + "learning_rate": 5.987666785199661e-05, + "loss": 0.6051784157752991, + "step": 6852 + }, + { + "epoch": 2.8919831223628694, + "grad_norm": 1.2406786680221558, + "learning_rate": 5.985312527332314e-05, + "loss": 0.5736448168754578, + "step": 6854 + }, + { + "epoch": 2.8928270042194093, + "grad_norm": 1.6206021308898926, + "learning_rate": 5.98295804217229e-05, + "loss": 0.5454224944114685, + "step": 6856 + }, + { + "epoch": 2.8936708860759492, + "grad_norm": 1.2756178379058838, + "learning_rate": 5.9806033302627227e-05, + "loss": 0.5912685990333557, + "step": 6858 + }, + { + "epoch": 2.894514767932489, + "grad_norm": 1.223631501197815, + "learning_rate": 5.9782483921468e-05, + "loss": 0.5619014501571655, + "step": 6860 + }, + { + "epoch": 2.8953586497890296, + "grad_norm": 1.06546151638031, + "learning_rate": 5.975893228367762e-05, + "loss": 0.5629459619522095, + "step": 6862 + }, + { + "epoch": 2.8962025316455695, + "grad_norm": 1.0573277473449707, + "learning_rate": 5.9735378394688965e-05, + "loss": 0.4997110366821289, + "step": 6864 + }, + { + "epoch": 2.8970464135021095, + "grad_norm": 1.2832465171813965, + "learning_rate": 5.97118222599355e-05, + "loss": 0.6370334625244141, + "step": 6866 + }, + { + "epoch": 2.89789029535865, + "grad_norm": 1.1721924543380737, + "learning_rate": 5.968826388485116e-05, + "loss": 0.6095840334892273, + "step": 6868 + }, + { + "epoch": 2.8987341772151898, + "grad_norm": 1.1428951025009155, + "learning_rate": 5.966470327487042e-05, + "loss": 0.6075419187545776, + "step": 6870 + }, + { + "epoch": 2.8995780590717297, + "grad_norm": 1.2369399070739746, + "learning_rate": 5.964114043542822e-05, + "loss": 0.6376850605010986, + "step": 6872 + }, + { + "epoch": 2.90042194092827, + "grad_norm": 1.178520679473877, + "learning_rate": 5.961757537196011e-05, + "loss": 0.57747882604599, + "step": 6874 + }, + { + "epoch": 2.90126582278481, + "grad_norm": 1.2600151300430298, + "learning_rate": 5.959400808990205e-05, + "loss": 0.626102864742279, + "step": 6876 + }, + { + "epoch": 2.90210970464135, + "grad_norm": 1.2809659242630005, + "learning_rate": 5.957043859469058e-05, + "loss": 0.6087106466293335, + "step": 6878 + }, + { + "epoch": 2.9029535864978904, + "grad_norm": 1.2029764652252197, + "learning_rate": 5.954686689176274e-05, + "loss": 0.599288284778595, + "step": 6880 + }, + { + "epoch": 2.9037974683544303, + "grad_norm": 1.2000751495361328, + "learning_rate": 5.952329298655607e-05, + "loss": 0.6364397406578064, + "step": 6882 + }, + { + "epoch": 2.9046413502109703, + "grad_norm": 1.3380756378173828, + "learning_rate": 5.949971688450859e-05, + "loss": 0.6032583713531494, + "step": 6884 + }, + { + "epoch": 2.9054852320675106, + "grad_norm": 1.207139015197754, + "learning_rate": 5.9476138591058874e-05, + "loss": 0.6217718720436096, + "step": 6886 + }, + { + "epoch": 2.9063291139240506, + "grad_norm": 1.2060731649398804, + "learning_rate": 5.945255811164598e-05, + "loss": 0.5663400888442993, + "step": 6888 + }, + { + "epoch": 2.9071729957805905, + "grad_norm": 1.3331942558288574, + "learning_rate": 5.9428975451709465e-05, + "loss": 0.583290696144104, + "step": 6890 + }, + { + "epoch": 2.908016877637131, + "grad_norm": 1.226565957069397, + "learning_rate": 5.94053906166894e-05, + "loss": 0.5606404542922974, + "step": 6892 + }, + { + "epoch": 2.908860759493671, + "grad_norm": 1.167909026145935, + "learning_rate": 5.938180361202636e-05, + "loss": 0.5337109565734863, + "step": 6894 + }, + { + "epoch": 2.909704641350211, + "grad_norm": 1.2748368978500366, + "learning_rate": 5.93582144431614e-05, + "loss": 0.64582759141922, + "step": 6896 + }, + { + "epoch": 2.910548523206751, + "grad_norm": 1.2209413051605225, + "learning_rate": 5.93346231155361e-05, + "loss": 0.631919801235199, + "step": 6898 + }, + { + "epoch": 2.911392405063291, + "grad_norm": 1.2692270278930664, + "learning_rate": 5.931102963459252e-05, + "loss": 0.5999054908752441, + "step": 6900 + }, + { + "epoch": 2.911392405063291, + "eval_loss": 0.6713213920593262, + "eval_runtime": 513.1265, + "eval_samples_per_second": 4.106, + "eval_steps_per_second": 4.106, + "step": 6900 + }, + { + "epoch": 2.912236286919831, + "grad_norm": 1.3654414415359497, + "learning_rate": 5.928743400577323e-05, + "loss": 0.634549081325531, + "step": 6902 + }, + { + "epoch": 2.9130801687763714, + "grad_norm": 1.4427542686462402, + "learning_rate": 5.926383623452128e-05, + "loss": 0.684973418712616, + "step": 6904 + }, + { + "epoch": 2.9139240506329114, + "grad_norm": 1.3192591667175293, + "learning_rate": 5.9240236326280216e-05, + "loss": 0.6641559600830078, + "step": 6906 + }, + { + "epoch": 2.9147679324894513, + "grad_norm": 1.3328732252120972, + "learning_rate": 5.921663428649411e-05, + "loss": 0.6443751454353333, + "step": 6908 + }, + { + "epoch": 2.9156118143459917, + "grad_norm": 1.191504716873169, + "learning_rate": 5.9193030120607486e-05, + "loss": 0.674626886844635, + "step": 6910 + }, + { + "epoch": 2.9164556962025316, + "grad_norm": 1.2599490880966187, + "learning_rate": 5.916942383406535e-05, + "loss": 0.6297666430473328, + "step": 6912 + }, + { + "epoch": 2.9172995780590716, + "grad_norm": 0.9829303622245789, + "learning_rate": 5.914581543231324e-05, + "loss": 0.5809952616691589, + "step": 6914 + }, + { + "epoch": 2.918143459915612, + "grad_norm": 1.1566280126571655, + "learning_rate": 5.9122204920797176e-05, + "loss": 0.6383126974105835, + "step": 6916 + }, + { + "epoch": 2.918987341772152, + "grad_norm": 1.047351360321045, + "learning_rate": 5.9098592304963616e-05, + "loss": 0.5681729316711426, + "step": 6918 + }, + { + "epoch": 2.919831223628692, + "grad_norm": 1.2059552669525146, + "learning_rate": 5.907497759025956e-05, + "loss": 0.5985210537910461, + "step": 6920 + }, + { + "epoch": 2.9206751054852322, + "grad_norm": 1.1845992803573608, + "learning_rate": 5.905136078213247e-05, + "loss": 0.5815024375915527, + "step": 6922 + }, + { + "epoch": 2.921518987341772, + "grad_norm": 1.3542579412460327, + "learning_rate": 5.9027741886030266e-05, + "loss": 0.6437575221061707, + "step": 6924 + }, + { + "epoch": 2.922362869198312, + "grad_norm": 1.1001946926116943, + "learning_rate": 5.900412090740139e-05, + "loss": 0.5773448348045349, + "step": 6926 + }, + { + "epoch": 2.9232067510548525, + "grad_norm": 1.220449447631836, + "learning_rate": 5.898049785169476e-05, + "loss": 0.6076427698135376, + "step": 6928 + }, + { + "epoch": 2.9240506329113924, + "grad_norm": 1.126592993736267, + "learning_rate": 5.895687272435975e-05, + "loss": 0.5418170690536499, + "step": 6930 + }, + { + "epoch": 2.9248945147679324, + "grad_norm": 1.1005871295928955, + "learning_rate": 5.893324553084622e-05, + "loss": 0.6057441234588623, + "step": 6932 + }, + { + "epoch": 2.9257383966244728, + "grad_norm": 1.0291813611984253, + "learning_rate": 5.89096162766045e-05, + "loss": 0.4844438433647156, + "step": 6934 + }, + { + "epoch": 2.9265822784810127, + "grad_norm": 1.0685851573944092, + "learning_rate": 5.888598496708543e-05, + "loss": 0.5230311751365662, + "step": 6936 + }, + { + "epoch": 2.9274261603375527, + "grad_norm": 1.1004319190979004, + "learning_rate": 5.8862351607740285e-05, + "loss": 0.6191393136978149, + "step": 6938 + }, + { + "epoch": 2.928270042194093, + "grad_norm": 1.2164443731307983, + "learning_rate": 5.8838716204020815e-05, + "loss": 0.5574309825897217, + "step": 6940 + }, + { + "epoch": 2.929113924050633, + "grad_norm": 1.104511022567749, + "learning_rate": 5.881507876137928e-05, + "loss": 0.5820326209068298, + "step": 6942 + }, + { + "epoch": 2.929957805907173, + "grad_norm": 1.4402027130126953, + "learning_rate": 5.879143928526838e-05, + "loss": 0.6016243696212769, + "step": 6944 + }, + { + "epoch": 2.9308016877637133, + "grad_norm": 1.2131510972976685, + "learning_rate": 5.8767797781141274e-05, + "loss": 0.574772834777832, + "step": 6946 + }, + { + "epoch": 2.9316455696202532, + "grad_norm": 1.2146058082580566, + "learning_rate": 5.874415425445159e-05, + "loss": 0.6725581884384155, + "step": 6948 + }, + { + "epoch": 2.932489451476793, + "grad_norm": 1.2887672185897827, + "learning_rate": 5.872050871065349e-05, + "loss": 0.5900663733482361, + "step": 6950 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 1.340739369392395, + "learning_rate": 5.869686115520148e-05, + "loss": 0.6624540686607361, + "step": 6952 + }, + { + "epoch": 2.9341772151898735, + "grad_norm": 1.3531051874160767, + "learning_rate": 5.867321159355062e-05, + "loss": 0.5319855809211731, + "step": 6954 + }, + { + "epoch": 2.9350210970464135, + "grad_norm": 1.441260814666748, + "learning_rate": 5.864956003115646e-05, + "loss": 0.6661397218704224, + "step": 6956 + }, + { + "epoch": 2.935864978902954, + "grad_norm": 1.314922571182251, + "learning_rate": 5.862590647347488e-05, + "loss": 0.6062843799591064, + "step": 6958 + }, + { + "epoch": 2.9367088607594938, + "grad_norm": 1.134419560432434, + "learning_rate": 5.860225092596237e-05, + "loss": 0.6123294234275818, + "step": 6960 + }, + { + "epoch": 2.9375527426160337, + "grad_norm": 1.3195313215255737, + "learning_rate": 5.8578593394075746e-05, + "loss": 0.5984833240509033, + "step": 6962 + }, + { + "epoch": 2.938396624472574, + "grad_norm": 1.1626067161560059, + "learning_rate": 5.855493388327242e-05, + "loss": 0.5695837736129761, + "step": 6964 + }, + { + "epoch": 2.939240506329114, + "grad_norm": 1.1392630338668823, + "learning_rate": 5.853127239901012e-05, + "loss": 0.5688632726669312, + "step": 6966 + }, + { + "epoch": 2.940084388185654, + "grad_norm": 1.2131112813949585, + "learning_rate": 5.850760894674713e-05, + "loss": 0.6139572262763977, + "step": 6968 + }, + { + "epoch": 2.9409282700421944, + "grad_norm": 1.1740806102752686, + "learning_rate": 5.8483943531942154e-05, + "loss": 0.6654361486434937, + "step": 6970 + }, + { + "epoch": 2.9417721518987343, + "grad_norm": 1.1364716291427612, + "learning_rate": 5.846027616005433e-05, + "loss": 0.5477408766746521, + "step": 6972 + }, + { + "epoch": 2.9426160337552743, + "grad_norm": 1.212761640548706, + "learning_rate": 5.843660683654328e-05, + "loss": 0.6023505926132202, + "step": 6974 + }, + { + "epoch": 2.943459915611814, + "grad_norm": 1.1042946577072144, + "learning_rate": 5.8412935566869075e-05, + "loss": 0.5926207304000854, + "step": 6976 + }, + { + "epoch": 2.9443037974683546, + "grad_norm": 1.2444789409637451, + "learning_rate": 5.83892623564922e-05, + "loss": 0.5590356588363647, + "step": 6978 + }, + { + "epoch": 2.9451476793248945, + "grad_norm": 1.0782465934753418, + "learning_rate": 5.8365587210873616e-05, + "loss": 0.553716778755188, + "step": 6980 + }, + { + "epoch": 2.9459915611814345, + "grad_norm": 1.1914669275283813, + "learning_rate": 5.834191013547473e-05, + "loss": 0.5937044024467468, + "step": 6982 + }, + { + "epoch": 2.946835443037975, + "grad_norm": 1.1819682121276855, + "learning_rate": 5.83182311357574e-05, + "loss": 0.6439019441604614, + "step": 6984 + }, + { + "epoch": 2.947679324894515, + "grad_norm": 1.1807081699371338, + "learning_rate": 5.829455021718389e-05, + "loss": 0.5403141379356384, + "step": 6986 + }, + { + "epoch": 2.9485232067510547, + "grad_norm": 1.2721227407455444, + "learning_rate": 5.827086738521692e-05, + "loss": 0.5281378626823425, + "step": 6988 + }, + { + "epoch": 2.9493670886075947, + "grad_norm": 1.6942147016525269, + "learning_rate": 5.824718264531972e-05, + "loss": 0.5722067952156067, + "step": 6990 + }, + { + "epoch": 2.950210970464135, + "grad_norm": 1.3415225744247437, + "learning_rate": 5.8223496002955865e-05, + "loss": 0.6228076815605164, + "step": 6992 + }, + { + "epoch": 2.951054852320675, + "grad_norm": 1.235356092453003, + "learning_rate": 5.819980746358941e-05, + "loss": 0.6019303202629089, + "step": 6994 + }, + { + "epoch": 2.951898734177215, + "grad_norm": 1.2500600814819336, + "learning_rate": 5.817611703268486e-05, + "loss": 0.5699147582054138, + "step": 6996 + }, + { + "epoch": 2.9527426160337553, + "grad_norm": 1.1581830978393555, + "learning_rate": 5.8152424715707145e-05, + "loss": 0.6304079294204712, + "step": 6998 + }, + { + "epoch": 2.9535864978902953, + "grad_norm": 1.2924201488494873, + "learning_rate": 5.812873051812161e-05, + "loss": 0.5464767217636108, + "step": 7000 + }, + { + "epoch": 2.9535864978902953, + "eval_loss": 0.6706293225288391, + "eval_runtime": 513.4396, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 7000 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 2 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.270271124805675e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7000/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/README.md b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0028988c0ff29a9ff4da9494c7bae60663cf8af --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/README.md @@ -0,0 +1,207 @@ +--- +base_model: Models/Devstral-Small-2-24B-HS-CPT +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Models/Devstral-Small-2-24B-HS-CPT +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.0 \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/adapter_config.json b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31810a8c9ae7f10d7755e383bf916a17d8099b79 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/adapter_config.json @@ -0,0 +1,43 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Models/Devstral-Small-2-24B-HS-CPT", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.0", + "qalora_group_size": 16, + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/adapter_model.safetensors b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87f87c6c6a3b97f937ebb247301604df0d0b5a1e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ef7ae49a4941d42fe49462493f5ffad21e852f638ea43fbdb9bdbcc73648bc +size 45690960 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/optimizer.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f61603f1ede603767f1b544a89aea923ff3f4a --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0365e99227a5d9346e52769b29710c7a29b99aa0572579a1e797dd29bf8a2ad5 +size 78912442 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/rng_state.pth b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..655072eb6e033007cab0253e43a0a7f77c76e0c6 --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead024ffad3a3e73307ede64c06b20de774f03ae89f82c955879566121478ca7 +size 14244 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/scheduler.pt b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42ca71b60a3a9aafb8225a115c0ea4fed67efbc --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7910aae9689c7b6e0e0d7abee60179f7c3dfde12095026fc1b95149104585032 +size 1064 diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/trainer_state.json b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e0322a55ac028b522ab03cecf791b1f945b3b5c --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/trainer_state.json @@ -0,0 +1,26893 @@ +{ + "best_global_step": 7100, + "best_metric": 0.6692973375320435, + "best_model_checkpoint": "task2file/sft_devstral_24B_v2/checkpoints/checkpoint-7000", + "epoch": 3.1645569620253164, + "eval_steps": 100, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008438818565400844, + "grad_norm": 1.597854733467102, + "learning_rate": 8.787346221441124e-08, + "loss": 1.3927901983261108, + "step": 2 + }, + { + "epoch": 0.0016877637130801688, + "grad_norm": 1.6547431945800781, + "learning_rate": 2.6362038664323375e-07, + "loss": 1.407160758972168, + "step": 4 + }, + { + "epoch": 0.002531645569620253, + "grad_norm": 1.8221601247787476, + "learning_rate": 4.393673110720563e-07, + "loss": 1.376656174659729, + "step": 6 + }, + { + "epoch": 0.0033755274261603376, + "grad_norm": 1.4831048250198364, + "learning_rate": 6.151142355008788e-07, + "loss": 1.247712254524231, + "step": 8 + }, + { + "epoch": 0.004219409282700422, + "grad_norm": 1.668201208114624, + "learning_rate": 7.908611599297013e-07, + "loss": 1.2685163021087646, + "step": 10 + }, + { + "epoch": 0.005063291139240506, + "grad_norm": 1.67417311668396, + "learning_rate": 9.666080843585237e-07, + "loss": 1.2942761182785034, + "step": 12 + }, + { + "epoch": 0.00590717299578059, + "grad_norm": 1.7154079675674438, + "learning_rate": 1.1423550087873463e-06, + "loss": 1.3638604879379272, + "step": 14 + }, + { + "epoch": 0.006751054852320675, + "grad_norm": 1.729427456855774, + "learning_rate": 1.3181019332161688e-06, + "loss": 1.3476728200912476, + "step": 16 + }, + { + "epoch": 0.007594936708860759, + "grad_norm": 1.3813447952270508, + "learning_rate": 1.4938488576449913e-06, + "loss": 1.3476393222808838, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 1.557220458984375, + "learning_rate": 1.6695957820738139e-06, + "loss": 1.2449309825897217, + "step": 20 + }, + { + "epoch": 0.009282700421940928, + "grad_norm": 1.1883500814437866, + "learning_rate": 1.8453427065026362e-06, + "loss": 1.3125361204147339, + "step": 22 + }, + { + "epoch": 0.010126582278481013, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.0210896309314587e-06, + "loss": 1.3724769353866577, + "step": 24 + }, + { + "epoch": 0.010970464135021098, + "grad_norm": 1.5627557039260864, + "learning_rate": 2.1968365553602812e-06, + "loss": 1.3401387929916382, + "step": 26 + }, + { + "epoch": 0.01181434599156118, + "grad_norm": 1.796866774559021, + "learning_rate": 2.3725834797891038e-06, + "loss": 1.365437388420105, + "step": 28 + }, + { + "epoch": 0.012658227848101266, + "grad_norm": 1.7030404806137085, + "learning_rate": 2.5483304042179263e-06, + "loss": 1.2706533670425415, + "step": 30 + }, + { + "epoch": 0.01350210970464135, + "grad_norm": 1.3186293840408325, + "learning_rate": 2.724077328646749e-06, + "loss": 1.3084994554519653, + "step": 32 + }, + { + "epoch": 0.014345991561181435, + "grad_norm": 1.5762513875961304, + "learning_rate": 2.8998242530755714e-06, + "loss": 1.3259696960449219, + "step": 34 + }, + { + "epoch": 0.015189873417721518, + "grad_norm": 1.422295331954956, + "learning_rate": 3.075571177504394e-06, + "loss": 1.3205676078796387, + "step": 36 + }, + { + "epoch": 0.016033755274261603, + "grad_norm": 1.495523452758789, + "learning_rate": 3.2513181019332165e-06, + "loss": 1.3740568161010742, + "step": 38 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 1.5112254619598389, + "learning_rate": 3.427065026362039e-06, + "loss": 1.321828842163086, + "step": 40 + }, + { + "epoch": 0.017721518987341773, + "grad_norm": 1.4667807817459106, + "learning_rate": 3.602811950790861e-06, + "loss": 1.3673173189163208, + "step": 42 + }, + { + "epoch": 0.018565400843881856, + "grad_norm": 1.6609723567962646, + "learning_rate": 3.7785588752196836e-06, + "loss": 1.3968093395233154, + "step": 44 + }, + { + "epoch": 0.019409282700421943, + "grad_norm": 1.59381103515625, + "learning_rate": 3.954305799648506e-06, + "loss": 1.4295302629470825, + "step": 46 + }, + { + "epoch": 0.020253164556962026, + "grad_norm": 1.1470608711242676, + "learning_rate": 4.130052724077329e-06, + "loss": 1.2536572217941284, + "step": 48 + }, + { + "epoch": 0.02109704641350211, + "grad_norm": 1.2014588117599487, + "learning_rate": 4.305799648506151e-06, + "loss": 1.242217779159546, + "step": 50 + }, + { + "epoch": 0.021940928270042195, + "grad_norm": 1.2327464818954468, + "learning_rate": 4.481546572934974e-06, + "loss": 1.2166963815689087, + "step": 52 + }, + { + "epoch": 0.02278481012658228, + "grad_norm": 1.9708983898162842, + "learning_rate": 4.657293497363796e-06, + "loss": 1.25709867477417, + "step": 54 + }, + { + "epoch": 0.02362869198312236, + "grad_norm": 1.180569052696228, + "learning_rate": 4.833040421792619e-06, + "loss": 1.2886158227920532, + "step": 56 + }, + { + "epoch": 0.024472573839662448, + "grad_norm": 1.5029548406600952, + "learning_rate": 5.008787346221441e-06, + "loss": 1.29886794090271, + "step": 58 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 1.5380216836929321, + "learning_rate": 5.184534270650264e-06, + "loss": 1.2387628555297852, + "step": 60 + }, + { + "epoch": 0.026160337552742614, + "grad_norm": 1.572144865989685, + "learning_rate": 5.3602811950790864e-06, + "loss": 1.2177000045776367, + "step": 62 + }, + { + "epoch": 0.0270042194092827, + "grad_norm": 1.4882780313491821, + "learning_rate": 5.536028119507909e-06, + "loss": 1.181516170501709, + "step": 64 + }, + { + "epoch": 0.027848101265822784, + "grad_norm": 1.2982488870620728, + "learning_rate": 5.7117750439367315e-06, + "loss": 1.2101733684539795, + "step": 66 + }, + { + "epoch": 0.02869198312236287, + "grad_norm": 1.5236955881118774, + "learning_rate": 5.887521968365554e-06, + "loss": 1.2277681827545166, + "step": 68 + }, + { + "epoch": 0.029535864978902954, + "grad_norm": 1.4521006345748901, + "learning_rate": 6.0632688927943766e-06, + "loss": 1.1688424348831177, + "step": 70 + }, + { + "epoch": 0.030379746835443037, + "grad_norm": 1.2352311611175537, + "learning_rate": 6.239015817223199e-06, + "loss": 1.273059368133545, + "step": 72 + }, + { + "epoch": 0.031223628691983123, + "grad_norm": 1.3438209295272827, + "learning_rate": 6.414762741652021e-06, + "loss": 1.1609034538269043, + "step": 74 + }, + { + "epoch": 0.032067510548523206, + "grad_norm": 1.9009398221969604, + "learning_rate": 6.590509666080843e-06, + "loss": 1.2508260011672974, + "step": 76 + }, + { + "epoch": 0.03291139240506329, + "grad_norm": 1.6718412637710571, + "learning_rate": 6.766256590509666e-06, + "loss": 1.2524956464767456, + "step": 78 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 1.249891757965088, + "learning_rate": 6.942003514938488e-06, + "loss": 1.1472493410110474, + "step": 80 + }, + { + "epoch": 0.03459915611814346, + "grad_norm": 1.4398653507232666, + "learning_rate": 7.117750439367312e-06, + "loss": 1.0845389366149902, + "step": 82 + }, + { + "epoch": 0.035443037974683546, + "grad_norm": 1.3701167106628418, + "learning_rate": 7.293497363796134e-06, + "loss": 1.1088868379592896, + "step": 84 + }, + { + "epoch": 0.036286919831223625, + "grad_norm": 1.277998924255371, + "learning_rate": 7.469244288224957e-06, + "loss": 1.1513772010803223, + "step": 86 + }, + { + "epoch": 0.03713080168776371, + "grad_norm": 1.4970002174377441, + "learning_rate": 7.644991212653779e-06, + "loss": 1.1385771036148071, + "step": 88 + }, + { + "epoch": 0.0379746835443038, + "grad_norm": 1.3384218215942383, + "learning_rate": 7.820738137082601e-06, + "loss": 1.1632680892944336, + "step": 90 + }, + { + "epoch": 0.038818565400843885, + "grad_norm": 1.4317446947097778, + "learning_rate": 7.996485061511425e-06, + "loss": 1.2256064414978027, + "step": 92 + }, + { + "epoch": 0.039662447257383965, + "grad_norm": 1.8743640184402466, + "learning_rate": 8.172231985940246e-06, + "loss": 1.1935789585113525, + "step": 94 + }, + { + "epoch": 0.04050632911392405, + "grad_norm": 1.4789546728134155, + "learning_rate": 8.347978910369069e-06, + "loss": 1.1429362297058105, + "step": 96 + }, + { + "epoch": 0.04135021097046414, + "grad_norm": 1.658605694770813, + "learning_rate": 8.523725834797891e-06, + "loss": 1.1831508874893188, + "step": 98 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 1.5077892541885376, + "learning_rate": 8.699472759226714e-06, + "loss": 1.0539867877960205, + "step": 100 + }, + { + "epoch": 0.04219409282700422, + "eval_loss": 1.138856053352356, + "eval_runtime": 859.7128, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 100 + }, + { + "epoch": 0.043037974683544304, + "grad_norm": 1.4335681200027466, + "learning_rate": 8.875219683655536e-06, + "loss": 1.0719901323318481, + "step": 102 + }, + { + "epoch": 0.04388185654008439, + "grad_norm": 1.7387681007385254, + "learning_rate": 9.050966608084359e-06, + "loss": 1.0654313564300537, + "step": 104 + }, + { + "epoch": 0.04472573839662447, + "grad_norm": 1.6071950197219849, + "learning_rate": 9.226713532513181e-06, + "loss": 1.0752698183059692, + "step": 106 + }, + { + "epoch": 0.04556962025316456, + "grad_norm": 1.40005362033844, + "learning_rate": 9.402460456942004e-06, + "loss": 1.1029763221740723, + "step": 108 + }, + { + "epoch": 0.046413502109704644, + "grad_norm": 2.2338669300079346, + "learning_rate": 9.578207381370826e-06, + "loss": 1.1157960891723633, + "step": 110 + }, + { + "epoch": 0.04725738396624472, + "grad_norm": 1.4972727298736572, + "learning_rate": 9.753954305799649e-06, + "loss": 1.1095420122146606, + "step": 112 + }, + { + "epoch": 0.04810126582278481, + "grad_norm": 1.317979097366333, + "learning_rate": 9.929701230228471e-06, + "loss": 1.109113097190857, + "step": 114 + }, + { + "epoch": 0.048945147679324896, + "grad_norm": 1.496346116065979, + "learning_rate": 1.0105448154657294e-05, + "loss": 1.1055104732513428, + "step": 116 + }, + { + "epoch": 0.049789029535864976, + "grad_norm": 1.385406732559204, + "learning_rate": 1.0281195079086117e-05, + "loss": 1.118395209312439, + "step": 118 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.524222731590271, + "learning_rate": 1.0456942003514939e-05, + "loss": 1.1008446216583252, + "step": 120 + }, + { + "epoch": 0.05147679324894515, + "grad_norm": 1.6308200359344482, + "learning_rate": 1.0632688927943762e-05, + "loss": 1.0891425609588623, + "step": 122 + }, + { + "epoch": 0.05232067510548523, + "grad_norm": 1.3681106567382812, + "learning_rate": 1.0808435852372584e-05, + "loss": 0.9080473184585571, + "step": 124 + }, + { + "epoch": 0.053164556962025315, + "grad_norm": 1.9429908990859985, + "learning_rate": 1.0984182776801407e-05, + "loss": 1.0337369441986084, + "step": 126 + }, + { + "epoch": 0.0540084388185654, + "grad_norm": 1.5830830335617065, + "learning_rate": 1.115992970123023e-05, + "loss": 1.0703333616256714, + "step": 128 + }, + { + "epoch": 0.05485232067510549, + "grad_norm": 1.4792555570602417, + "learning_rate": 1.1335676625659052e-05, + "loss": 1.004652738571167, + "step": 130 + }, + { + "epoch": 0.05569620253164557, + "grad_norm": 1.7196226119995117, + "learning_rate": 1.1511423550087874e-05, + "loss": 0.9798293709754944, + "step": 132 + }, + { + "epoch": 0.056540084388185655, + "grad_norm": 1.8733659982681274, + "learning_rate": 1.1687170474516697e-05, + "loss": 1.0213249921798706, + "step": 134 + }, + { + "epoch": 0.05738396624472574, + "grad_norm": 1.3431142568588257, + "learning_rate": 1.186291739894552e-05, + "loss": 1.0358591079711914, + "step": 136 + }, + { + "epoch": 0.05822784810126582, + "grad_norm": 1.527864933013916, + "learning_rate": 1.2038664323374342e-05, + "loss": 0.9372249841690063, + "step": 138 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5495563745498657, + "learning_rate": 1.2214411247803164e-05, + "loss": 1.0277758836746216, + "step": 140 + }, + { + "epoch": 0.059915611814345994, + "grad_norm": 1.6792418956756592, + "learning_rate": 1.2390158172231985e-05, + "loss": 1.0349801778793335, + "step": 142 + }, + { + "epoch": 0.060759493670886074, + "grad_norm": 1.6468945741653442, + "learning_rate": 1.256590509666081e-05, + "loss": 0.9578297734260559, + "step": 144 + }, + { + "epoch": 0.06160337552742616, + "grad_norm": 1.7243824005126953, + "learning_rate": 1.2741652021089632e-05, + "loss": 1.0628854036331177, + "step": 146 + }, + { + "epoch": 0.06244725738396625, + "grad_norm": 1.7286981344223022, + "learning_rate": 1.2917398945518455e-05, + "loss": 0.9336449503898621, + "step": 148 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.6411832571029663, + "learning_rate": 1.3093145869947277e-05, + "loss": 0.953730583190918, + "step": 150 + }, + { + "epoch": 0.06413502109704641, + "grad_norm": 1.8297001123428345, + "learning_rate": 1.3268892794376098e-05, + "loss": 1.051239013671875, + "step": 152 + }, + { + "epoch": 0.06497890295358649, + "grad_norm": 1.9660519361495972, + "learning_rate": 1.3444639718804922e-05, + "loss": 0.9955035448074341, + "step": 154 + }, + { + "epoch": 0.06582278481012659, + "grad_norm": 1.8423733711242676, + "learning_rate": 1.3620386643233743e-05, + "loss": 0.913300096988678, + "step": 156 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.9146347045898438, + "learning_rate": 1.3796133567662567e-05, + "loss": 1.0429846048355103, + "step": 158 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 1.6221821308135986, + "learning_rate": 1.3971880492091388e-05, + "loss": 1.0360238552093506, + "step": 160 + }, + { + "epoch": 0.06835443037974684, + "grad_norm": 2.173283338546753, + "learning_rate": 1.4147627416520212e-05, + "loss": 1.0227266550064087, + "step": 162 + }, + { + "epoch": 0.06919831223628692, + "grad_norm": 1.7091665267944336, + "learning_rate": 1.4323374340949033e-05, + "loss": 1.0075194835662842, + "step": 164 + }, + { + "epoch": 0.070042194092827, + "grad_norm": 1.7219135761260986, + "learning_rate": 1.4499121265377857e-05, + "loss": 1.0044782161712646, + "step": 166 + }, + { + "epoch": 0.07088607594936709, + "grad_norm": 1.6558159589767456, + "learning_rate": 1.4674868189806678e-05, + "loss": 0.9393973350524902, + "step": 168 + }, + { + "epoch": 0.07172995780590717, + "grad_norm": 1.9362739324569702, + "learning_rate": 1.4850615114235502e-05, + "loss": 0.9955337643623352, + "step": 170 + }, + { + "epoch": 0.07257383966244725, + "grad_norm": 1.7792853116989136, + "learning_rate": 1.5026362038664323e-05, + "loss": 0.9659126400947571, + "step": 172 + }, + { + "epoch": 0.07341772151898734, + "grad_norm": 1.7184511423110962, + "learning_rate": 1.5202108963093147e-05, + "loss": 0.9077855348587036, + "step": 174 + }, + { + "epoch": 0.07426160337552742, + "grad_norm": 1.5701428651809692, + "learning_rate": 1.537785588752197e-05, + "loss": 0.9305018782615662, + "step": 176 + }, + { + "epoch": 0.0751054852320675, + "grad_norm": 1.970229148864746, + "learning_rate": 1.555360281195079e-05, + "loss": 1.0211774110794067, + "step": 178 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 1.8410269021987915, + "learning_rate": 1.5729349736379615e-05, + "loss": 0.9479315876960754, + "step": 180 + }, + { + "epoch": 0.07679324894514768, + "grad_norm": 1.8991246223449707, + "learning_rate": 1.5905096660808434e-05, + "loss": 1.0629050731658936, + "step": 182 + }, + { + "epoch": 0.07763713080168777, + "grad_norm": 1.8052008152008057, + "learning_rate": 1.608084358523726e-05, + "loss": 0.946983814239502, + "step": 184 + }, + { + "epoch": 0.07848101265822785, + "grad_norm": 1.547108769416809, + "learning_rate": 1.625659050966608e-05, + "loss": 0.9413356184959412, + "step": 186 + }, + { + "epoch": 0.07932489451476793, + "grad_norm": 1.9713538885116577, + "learning_rate": 1.6432337434094905e-05, + "loss": 0.9337888956069946, + "step": 188 + }, + { + "epoch": 0.08016877637130802, + "grad_norm": 1.708789348602295, + "learning_rate": 1.6608084358523728e-05, + "loss": 0.9816337823867798, + "step": 190 + }, + { + "epoch": 0.0810126582278481, + "grad_norm": 1.815292477607727, + "learning_rate": 1.678383128295255e-05, + "loss": 1.017122507095337, + "step": 192 + }, + { + "epoch": 0.08185654008438818, + "grad_norm": 1.7950682640075684, + "learning_rate": 1.6959578207381373e-05, + "loss": 0.991599440574646, + "step": 194 + }, + { + "epoch": 0.08270042194092828, + "grad_norm": 1.692512035369873, + "learning_rate": 1.7135325131810195e-05, + "loss": 0.9570834040641785, + "step": 196 + }, + { + "epoch": 0.08354430379746836, + "grad_norm": 2.056089162826538, + "learning_rate": 1.7311072056239018e-05, + "loss": 1.035754919052124, + "step": 198 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.7022203207015991, + "learning_rate": 1.7486818980667837e-05, + "loss": 1.0124205350875854, + "step": 200 + }, + { + "epoch": 0.08438818565400844, + "eval_loss": 0.995743453502655, + "eval_runtime": 846.8257, + "eval_samples_per_second": 2.488, + "eval_steps_per_second": 2.488, + "step": 200 + }, + { + "epoch": 0.08523206751054853, + "grad_norm": 1.6088604927062988, + "learning_rate": 1.7662565905096663e-05, + "loss": 0.8946985006332397, + "step": 202 + }, + { + "epoch": 0.08607594936708861, + "grad_norm": 2.02270770072937, + "learning_rate": 1.7838312829525482e-05, + "loss": 0.976133406162262, + "step": 204 + }, + { + "epoch": 0.08691983122362869, + "grad_norm": 1.7832789421081543, + "learning_rate": 1.8014059753954308e-05, + "loss": 0.9079383611679077, + "step": 206 + }, + { + "epoch": 0.08776371308016878, + "grad_norm": 1.9793545007705688, + "learning_rate": 1.8189806678383127e-05, + "loss": 0.8650367856025696, + "step": 208 + }, + { + "epoch": 0.08860759493670886, + "grad_norm": 1.8124271631240845, + "learning_rate": 1.8365553602811953e-05, + "loss": 0.9327266812324524, + "step": 210 + }, + { + "epoch": 0.08945147679324894, + "grad_norm": 1.8581212759017944, + "learning_rate": 1.8541300527240772e-05, + "loss": 0.9811079502105713, + "step": 212 + }, + { + "epoch": 0.09029535864978903, + "grad_norm": 2.001699447631836, + "learning_rate": 1.8717047451669598e-05, + "loss": 0.9546971321105957, + "step": 214 + }, + { + "epoch": 0.09113924050632911, + "grad_norm": 1.6994978189468384, + "learning_rate": 1.8892794376098417e-05, + "loss": 0.9611319899559021, + "step": 216 + }, + { + "epoch": 0.0919831223628692, + "grad_norm": 2.1379497051239014, + "learning_rate": 1.9068541300527243e-05, + "loss": 0.9781531095504761, + "step": 218 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.8961224555969238, + "learning_rate": 1.9244288224956066e-05, + "loss": 0.9374833106994629, + "step": 220 + }, + { + "epoch": 0.09367088607594937, + "grad_norm": 1.851464033126831, + "learning_rate": 1.9420035149384885e-05, + "loss": 0.9681299328804016, + "step": 222 + }, + { + "epoch": 0.09451476793248945, + "grad_norm": 2.0642266273498535, + "learning_rate": 1.959578207381371e-05, + "loss": 1.0086225271224976, + "step": 224 + }, + { + "epoch": 0.09535864978902954, + "grad_norm": 1.8658756017684937, + "learning_rate": 1.977152899824253e-05, + "loss": 0.9190312623977661, + "step": 226 + }, + { + "epoch": 0.09620253164556962, + "grad_norm": 2.4398674964904785, + "learning_rate": 1.9947275922671356e-05, + "loss": 0.9740874171257019, + "step": 228 + }, + { + "epoch": 0.0970464135021097, + "grad_norm": 1.849183440208435, + "learning_rate": 2.0123022847100175e-05, + "loss": 0.884376049041748, + "step": 230 + }, + { + "epoch": 0.09789029535864979, + "grad_norm": 2.027320384979248, + "learning_rate": 2.0298769771529e-05, + "loss": 0.9116487503051758, + "step": 232 + }, + { + "epoch": 0.09873417721518987, + "grad_norm": 1.6800135374069214, + "learning_rate": 2.047451669595782e-05, + "loss": 0.9035115242004395, + "step": 234 + }, + { + "epoch": 0.09957805907172995, + "grad_norm": 2.2362256050109863, + "learning_rate": 2.0650263620386646e-05, + "loss": 0.9043796062469482, + "step": 236 + }, + { + "epoch": 0.10042194092827005, + "grad_norm": 1.938215970993042, + "learning_rate": 2.0826010544815465e-05, + "loss": 1.0888828039169312, + "step": 238 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.890328049659729, + "learning_rate": 2.100175746924429e-05, + "loss": 0.9960280656814575, + "step": 240 + }, + { + "epoch": 0.1021097046413502, + "grad_norm": 2.021235227584839, + "learning_rate": 2.117750439367311e-05, + "loss": 0.9848901629447937, + "step": 242 + }, + { + "epoch": 0.1029535864978903, + "grad_norm": 2.023920774459839, + "learning_rate": 2.1353251318101936e-05, + "loss": 0.891694188117981, + "step": 244 + }, + { + "epoch": 0.10379746835443038, + "grad_norm": 1.8061069250106812, + "learning_rate": 2.1528998242530755e-05, + "loss": 0.9059976935386658, + "step": 246 + }, + { + "epoch": 0.10464135021097046, + "grad_norm": 2.176302194595337, + "learning_rate": 2.1704745166959578e-05, + "loss": 1.0056109428405762, + "step": 248 + }, + { + "epoch": 0.10548523206751055, + "grad_norm": 1.9820969104766846, + "learning_rate": 2.18804920913884e-05, + "loss": 0.9645357728004456, + "step": 250 + }, + { + "epoch": 0.10632911392405063, + "grad_norm": 1.8764572143554688, + "learning_rate": 2.2056239015817223e-05, + "loss": 1.0178182125091553, + "step": 252 + }, + { + "epoch": 0.10717299578059072, + "grad_norm": 2.56221342086792, + "learning_rate": 2.223198594024605e-05, + "loss": 0.9546761512756348, + "step": 254 + }, + { + "epoch": 0.1080168776371308, + "grad_norm": 2.6779074668884277, + "learning_rate": 2.2407732864674868e-05, + "loss": 0.9300968647003174, + "step": 256 + }, + { + "epoch": 0.10886075949367088, + "grad_norm": 2.140897512435913, + "learning_rate": 2.2583479789103694e-05, + "loss": 0.926638662815094, + "step": 258 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 2.0880508422851562, + "learning_rate": 2.2759226713532513e-05, + "loss": 1.0681840181350708, + "step": 260 + }, + { + "epoch": 0.11054852320675106, + "grad_norm": 2.7273616790771484, + "learning_rate": 2.293497363796134e-05, + "loss": 1.0840941667556763, + "step": 262 + }, + { + "epoch": 0.11139240506329114, + "grad_norm": 1.6723874807357788, + "learning_rate": 2.3110720562390158e-05, + "loss": 0.8637182116508484, + "step": 264 + }, + { + "epoch": 0.11223628691983123, + "grad_norm": 1.806243896484375, + "learning_rate": 2.3286467486818984e-05, + "loss": 0.9554686546325684, + "step": 266 + }, + { + "epoch": 0.11308016877637131, + "grad_norm": 1.9086743593215942, + "learning_rate": 2.3462214411247803e-05, + "loss": 0.9556593894958496, + "step": 268 + }, + { + "epoch": 0.11392405063291139, + "grad_norm": 2.1822304725646973, + "learning_rate": 2.3637961335676626e-05, + "loss": 0.9177709817886353, + "step": 270 + }, + { + "epoch": 0.11476793248945148, + "grad_norm": 2.1009039878845215, + "learning_rate": 2.3813708260105448e-05, + "loss": 0.9288759827613831, + "step": 272 + }, + { + "epoch": 0.11561181434599156, + "grad_norm": 1.9814810752868652, + "learning_rate": 2.398945518453427e-05, + "loss": 0.9881691932678223, + "step": 274 + }, + { + "epoch": 0.11645569620253164, + "grad_norm": 1.9946284294128418, + "learning_rate": 2.4165202108963093e-05, + "loss": 0.9390727281570435, + "step": 276 + }, + { + "epoch": 0.11729957805907174, + "grad_norm": 2.4489169120788574, + "learning_rate": 2.4340949033391916e-05, + "loss": 0.9625692963600159, + "step": 278 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.0919103622436523, + "learning_rate": 2.451669595782074e-05, + "loss": 0.9304702877998352, + "step": 280 + }, + { + "epoch": 0.1189873417721519, + "grad_norm": 1.912914752960205, + "learning_rate": 2.469244288224956e-05, + "loss": 0.9313994646072388, + "step": 282 + }, + { + "epoch": 0.11983122362869199, + "grad_norm": 2.1553256511688232, + "learning_rate": 2.4868189806678387e-05, + "loss": 1.004011869430542, + "step": 284 + }, + { + "epoch": 0.12067510548523207, + "grad_norm": 2.0129058361053467, + "learning_rate": 2.504393673110721e-05, + "loss": 0.9092531204223633, + "step": 286 + }, + { + "epoch": 0.12151898734177215, + "grad_norm": 2.1632325649261475, + "learning_rate": 2.5219683655536032e-05, + "loss": 0.993347704410553, + "step": 288 + }, + { + "epoch": 0.12236286919831224, + "grad_norm": 2.3072738647460938, + "learning_rate": 2.539543057996485e-05, + "loss": 0.978348433971405, + "step": 290 + }, + { + "epoch": 0.12320675105485232, + "grad_norm": 2.056560516357422, + "learning_rate": 2.5571177504393674e-05, + "loss": 1.0018101930618286, + "step": 292 + }, + { + "epoch": 0.1240506329113924, + "grad_norm": 1.8906747102737427, + "learning_rate": 2.5746924428822493e-05, + "loss": 0.9607775211334229, + "step": 294 + }, + { + "epoch": 0.1248945147679325, + "grad_norm": 2.1375651359558105, + "learning_rate": 2.5922671353251322e-05, + "loss": 0.9259153008460999, + "step": 296 + }, + { + "epoch": 0.1257383966244726, + "grad_norm": 1.9994823932647705, + "learning_rate": 2.609841827768014e-05, + "loss": 0.8524524569511414, + "step": 298 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 2.2421181201934814, + "learning_rate": 2.6274165202108964e-05, + "loss": 1.0047069787979126, + "step": 300 + }, + { + "epoch": 0.12658227848101267, + "eval_loss": 0.9517185688018799, + "eval_runtime": 860.0287, + "eval_samples_per_second": 2.45, + "eval_steps_per_second": 2.45, + "step": 300 + }, + { + "epoch": 0.12742616033755275, + "grad_norm": 2.1206254959106445, + "learning_rate": 2.6449912126537786e-05, + "loss": 0.8475471138954163, + "step": 302 + }, + { + "epoch": 0.12827004219409283, + "grad_norm": 1.885161280632019, + "learning_rate": 2.6625659050966612e-05, + "loss": 0.8643121123313904, + "step": 304 + }, + { + "epoch": 0.1291139240506329, + "grad_norm": 3.1441781520843506, + "learning_rate": 2.680140597539543e-05, + "loss": 0.8804612159729004, + "step": 306 + }, + { + "epoch": 0.12995780590717299, + "grad_norm": 1.953133225440979, + "learning_rate": 2.6977152899824254e-05, + "loss": 0.8348029255867004, + "step": 308 + }, + { + "epoch": 0.1308016877637131, + "grad_norm": 2.3762667179107666, + "learning_rate": 2.7152899824253076e-05, + "loss": 0.8889057040214539, + "step": 310 + }, + { + "epoch": 0.13164556962025317, + "grad_norm": 2.4651103019714355, + "learning_rate": 2.7328646748681902e-05, + "loss": 1.025565505027771, + "step": 312 + }, + { + "epoch": 0.13248945147679325, + "grad_norm": 1.8522284030914307, + "learning_rate": 2.7504393673110725e-05, + "loss": 0.868915855884552, + "step": 314 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.8048083782196045, + "learning_rate": 2.7680140597539544e-05, + "loss": 0.8821638226509094, + "step": 316 + }, + { + "epoch": 0.1341772151898734, + "grad_norm": 1.9933605194091797, + "learning_rate": 2.7855887521968367e-05, + "loss": 0.8735360503196716, + "step": 318 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 2.044337034225464, + "learning_rate": 2.8031634446397186e-05, + "loss": 0.8288834691047668, + "step": 320 + }, + { + "epoch": 0.1358649789029536, + "grad_norm": 2.416067361831665, + "learning_rate": 2.8207381370826015e-05, + "loss": 0.9104969501495361, + "step": 322 + }, + { + "epoch": 0.13670886075949368, + "grad_norm": 2.0731265544891357, + "learning_rate": 2.8383128295254834e-05, + "loss": 0.8689924478530884, + "step": 324 + }, + { + "epoch": 0.13755274261603376, + "grad_norm": 2.049126386642456, + "learning_rate": 2.8558875219683657e-05, + "loss": 0.9312222003936768, + "step": 326 + }, + { + "epoch": 0.13839662447257384, + "grad_norm": 2.131026268005371, + "learning_rate": 2.8734622144112476e-05, + "loss": 0.8933501839637756, + "step": 328 + }, + { + "epoch": 0.13924050632911392, + "grad_norm": 1.766754150390625, + "learning_rate": 2.8910369068541305e-05, + "loss": 0.8998261094093323, + "step": 330 + }, + { + "epoch": 0.140084388185654, + "grad_norm": 2.197706460952759, + "learning_rate": 2.9086115992970124e-05, + "loss": 0.8826426267623901, + "step": 332 + }, + { + "epoch": 0.1409282700421941, + "grad_norm": 1.953715443611145, + "learning_rate": 2.9261862917398947e-05, + "loss": 0.8590307831764221, + "step": 334 + }, + { + "epoch": 0.14177215189873418, + "grad_norm": 2.200929880142212, + "learning_rate": 2.943760984182777e-05, + "loss": 0.9317060708999634, + "step": 336 + }, + { + "epoch": 0.14261603375527426, + "grad_norm": 2.1195082664489746, + "learning_rate": 2.961335676625659e-05, + "loss": 0.9965578317642212, + "step": 338 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 2.3449771404266357, + "learning_rate": 2.9789103690685414e-05, + "loss": 0.8353848457336426, + "step": 340 + }, + { + "epoch": 0.14430379746835442, + "grad_norm": 2.000497579574585, + "learning_rate": 2.9964850615114237e-05, + "loss": 0.9154735803604126, + "step": 342 + }, + { + "epoch": 0.1451476793248945, + "grad_norm": 2.141890525817871, + "learning_rate": 3.014059753954306e-05, + "loss": 0.9530655741691589, + "step": 344 + }, + { + "epoch": 0.1459915611814346, + "grad_norm": 1.7717392444610596, + "learning_rate": 3.031634446397188e-05, + "loss": 0.896998405456543, + "step": 346 + }, + { + "epoch": 0.1468354430379747, + "grad_norm": 1.8796685934066772, + "learning_rate": 3.0492091388400708e-05, + "loss": 0.9084208011627197, + "step": 348 + }, + { + "epoch": 0.14767932489451477, + "grad_norm": 2.0298709869384766, + "learning_rate": 3.066783831282953e-05, + "loss": 0.9183387756347656, + "step": 350 + }, + { + "epoch": 0.14852320675105485, + "grad_norm": 1.9245645999908447, + "learning_rate": 3.084358523725835e-05, + "loss": 0.8624772429466248, + "step": 352 + }, + { + "epoch": 0.14936708860759493, + "grad_norm": 2.325681209564209, + "learning_rate": 3.101933216168717e-05, + "loss": 0.9142400026321411, + "step": 354 + }, + { + "epoch": 0.150210970464135, + "grad_norm": 2.1200530529022217, + "learning_rate": 3.1195079086115995e-05, + "loss": 0.9064018130302429, + "step": 356 + }, + { + "epoch": 0.15105485232067511, + "grad_norm": 1.979314923286438, + "learning_rate": 3.137082601054482e-05, + "loss": 0.9199238419532776, + "step": 358 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 2.1122689247131348, + "learning_rate": 3.154657293497364e-05, + "loss": 0.8030132055282593, + "step": 360 + }, + { + "epoch": 0.15274261603375527, + "grad_norm": 2.105767250061035, + "learning_rate": 3.172231985940246e-05, + "loss": 0.9185854196548462, + "step": 362 + }, + { + "epoch": 0.15358649789029535, + "grad_norm": 2.179471015930176, + "learning_rate": 3.1898066783831285e-05, + "loss": 0.9365083575248718, + "step": 364 + }, + { + "epoch": 0.15443037974683543, + "grad_norm": 2.1444311141967773, + "learning_rate": 3.207381370826011e-05, + "loss": 0.8965140581130981, + "step": 366 + }, + { + "epoch": 0.15527426160337554, + "grad_norm": 2.4171674251556396, + "learning_rate": 3.224956063268893e-05, + "loss": 0.8787504434585571, + "step": 368 + }, + { + "epoch": 0.15611814345991562, + "grad_norm": 2.418628215789795, + "learning_rate": 3.242530755711775e-05, + "loss": 0.8925284147262573, + "step": 370 + }, + { + "epoch": 0.1569620253164557, + "grad_norm": 2.2228314876556396, + "learning_rate": 3.2601054481546575e-05, + "loss": 0.876179039478302, + "step": 372 + }, + { + "epoch": 0.15780590717299578, + "grad_norm": 2.324237108230591, + "learning_rate": 3.27768014059754e-05, + "loss": 0.8365707993507385, + "step": 374 + }, + { + "epoch": 0.15864978902953586, + "grad_norm": 2.6344552040100098, + "learning_rate": 3.295254833040422e-05, + "loss": 0.7864399552345276, + "step": 376 + }, + { + "epoch": 0.15949367088607594, + "grad_norm": 2.047536611557007, + "learning_rate": 3.312829525483304e-05, + "loss": 0.9271875023841858, + "step": 378 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 2.120025157928467, + "learning_rate": 3.3304042179261865e-05, + "loss": 0.8799133896827698, + "step": 380 + }, + { + "epoch": 0.16118143459915613, + "grad_norm": 2.363692045211792, + "learning_rate": 3.347978910369069e-05, + "loss": 0.8973530530929565, + "step": 382 + }, + { + "epoch": 0.1620253164556962, + "grad_norm": 2.1796772480010986, + "learning_rate": 3.365553602811951e-05, + "loss": 1.0277652740478516, + "step": 384 + }, + { + "epoch": 0.16286919831223629, + "grad_norm": 1.9192595481872559, + "learning_rate": 3.383128295254833e-05, + "loss": 0.8909643888473511, + "step": 386 + }, + { + "epoch": 0.16371308016877636, + "grad_norm": 1.7874376773834229, + "learning_rate": 3.4007029876977155e-05, + "loss": 0.837049663066864, + "step": 388 + }, + { + "epoch": 0.16455696202531644, + "grad_norm": 2.3402366638183594, + "learning_rate": 3.4182776801405974e-05, + "loss": 0.8625202775001526, + "step": 390 + }, + { + "epoch": 0.16540084388185655, + "grad_norm": 2.1137185096740723, + "learning_rate": 3.43585237258348e-05, + "loss": 0.9288321137428284, + "step": 392 + }, + { + "epoch": 0.16624472573839663, + "grad_norm": 2.3776895999908447, + "learning_rate": 3.453427065026362e-05, + "loss": 0.9328726530075073, + "step": 394 + }, + { + "epoch": 0.1670886075949367, + "grad_norm": 2.34941029548645, + "learning_rate": 3.4710017574692445e-05, + "loss": 0.9273309707641602, + "step": 396 + }, + { + "epoch": 0.1679324894514768, + "grad_norm": 2.1272573471069336, + "learning_rate": 3.4885764499121264e-05, + "loss": 0.8703887462615967, + "step": 398 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 2.047290802001953, + "learning_rate": 3.506151142355009e-05, + "loss": 0.8808165788650513, + "step": 400 + }, + { + "epoch": 0.16877637130801687, + "eval_loss": 0.9282881617546082, + "eval_runtime": 869.6867, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 2.423, + "step": 400 + }, + { + "epoch": 0.16962025316455695, + "grad_norm": 1.9874159097671509, + "learning_rate": 3.5237258347978916e-05, + "loss": 0.9643645286560059, + "step": 402 + }, + { + "epoch": 0.17046413502109706, + "grad_norm": 1.9299919605255127, + "learning_rate": 3.5413005272407735e-05, + "loss": 0.9173495769500732, + "step": 404 + }, + { + "epoch": 0.17130801687763714, + "grad_norm": 2.3379697799682617, + "learning_rate": 3.5588752196836555e-05, + "loss": 0.8998411893844604, + "step": 406 + }, + { + "epoch": 0.17215189873417722, + "grad_norm": 2.241370916366577, + "learning_rate": 3.5764499121265374e-05, + "loss": 0.9310802221298218, + "step": 408 + }, + { + "epoch": 0.1729957805907173, + "grad_norm": 2.4490108489990234, + "learning_rate": 3.5940246045694206e-05, + "loss": 0.9605053067207336, + "step": 410 + }, + { + "epoch": 0.17383966244725738, + "grad_norm": 1.8247230052947998, + "learning_rate": 3.6115992970123026e-05, + "loss": 0.8485683798789978, + "step": 412 + }, + { + "epoch": 0.17468354430379746, + "grad_norm": 2.4608843326568604, + "learning_rate": 3.6291739894551845e-05, + "loss": 0.9325968623161316, + "step": 414 + }, + { + "epoch": 0.17552742616033756, + "grad_norm": 1.8923161029815674, + "learning_rate": 3.646748681898067e-05, + "loss": 0.9125096201896667, + "step": 416 + }, + { + "epoch": 0.17637130801687764, + "grad_norm": 1.8502769470214844, + "learning_rate": 3.6643233743409497e-05, + "loss": 0.8852217197418213, + "step": 418 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.9155100584030151, + "learning_rate": 3.6818980667838316e-05, + "loss": 0.9192792773246765, + "step": 420 + }, + { + "epoch": 0.1780590717299578, + "grad_norm": 2.181476593017578, + "learning_rate": 3.6994727592267135e-05, + "loss": 0.8787404298782349, + "step": 422 + }, + { + "epoch": 0.17890295358649788, + "grad_norm": 2.2469847202301025, + "learning_rate": 3.717047451669596e-05, + "loss": 0.9109582901000977, + "step": 424 + }, + { + "epoch": 0.17974683544303796, + "grad_norm": 2.08145809173584, + "learning_rate": 3.734622144112479e-05, + "loss": 0.8560389280319214, + "step": 426 + }, + { + "epoch": 0.18059071729957807, + "grad_norm": 4.121932506561279, + "learning_rate": 3.7521968365553606e-05, + "loss": 0.9456104040145874, + "step": 428 + }, + { + "epoch": 0.18143459915611815, + "grad_norm": 2.177459478378296, + "learning_rate": 3.7697715289982425e-05, + "loss": 0.8421300649642944, + "step": 430 + }, + { + "epoch": 0.18227848101265823, + "grad_norm": 2.324970245361328, + "learning_rate": 3.787346221441125e-05, + "loss": 0.9199858903884888, + "step": 432 + }, + { + "epoch": 0.1831223628691983, + "grad_norm": 2.133718490600586, + "learning_rate": 3.804920913884007e-05, + "loss": 0.8953126668930054, + "step": 434 + }, + { + "epoch": 0.1839662447257384, + "grad_norm": 1.8527995347976685, + "learning_rate": 3.8224956063268896e-05, + "loss": 0.8732239007949829, + "step": 436 + }, + { + "epoch": 0.1848101265822785, + "grad_norm": 1.95817232131958, + "learning_rate": 3.8400702987697715e-05, + "loss": 0.8818746209144592, + "step": 438 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 2.2107293605804443, + "learning_rate": 3.857644991212654e-05, + "loss": 0.9153507947921753, + "step": 440 + }, + { + "epoch": 0.18649789029535865, + "grad_norm": 2.004754066467285, + "learning_rate": 3.875219683655536e-05, + "loss": 0.8960154056549072, + "step": 442 + }, + { + "epoch": 0.18734177215189873, + "grad_norm": 2.1851706504821777, + "learning_rate": 3.8927943760984186e-05, + "loss": 0.909011721611023, + "step": 444 + }, + { + "epoch": 0.1881856540084388, + "grad_norm": 2.4492485523223877, + "learning_rate": 3.9103690685413005e-05, + "loss": 0.8880158066749573, + "step": 446 + }, + { + "epoch": 0.1890295358649789, + "grad_norm": 2.745453119277954, + "learning_rate": 3.927943760984183e-05, + "loss": 0.8500842452049255, + "step": 448 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 2.1924264430999756, + "learning_rate": 3.945518453427065e-05, + "loss": 0.9004045724868774, + "step": 450 + }, + { + "epoch": 0.19071729957805908, + "grad_norm": 2.4051687717437744, + "learning_rate": 3.9630931458699476e-05, + "loss": 0.9020664095878601, + "step": 452 + }, + { + "epoch": 0.19156118143459916, + "grad_norm": 1.8077667951583862, + "learning_rate": 3.9806678383128295e-05, + "loss": 0.8639500737190247, + "step": 454 + }, + { + "epoch": 0.19240506329113924, + "grad_norm": 2.089043378829956, + "learning_rate": 3.998242530755712e-05, + "loss": 0.8642048239707947, + "step": 456 + }, + { + "epoch": 0.19324894514767932, + "grad_norm": 2.029578447341919, + "learning_rate": 4.015817223198594e-05, + "loss": 0.9371927380561829, + "step": 458 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 2.26582407951355, + "learning_rate": 4.033391915641476e-05, + "loss": 0.9120588302612305, + "step": 460 + }, + { + "epoch": 0.1949367088607595, + "grad_norm": 1.8671411275863647, + "learning_rate": 4.050966608084359e-05, + "loss": 0.8758644461631775, + "step": 462 + }, + { + "epoch": 0.19578059071729959, + "grad_norm": 1.9403492212295532, + "learning_rate": 4.068541300527241e-05, + "loss": 0.914577305316925, + "step": 464 + }, + { + "epoch": 0.19662447257383966, + "grad_norm": 1.9939641952514648, + "learning_rate": 4.086115992970123e-05, + "loss": 0.8592531681060791, + "step": 466 + }, + { + "epoch": 0.19746835443037974, + "grad_norm": 2.1511380672454834, + "learning_rate": 4.103690685413005e-05, + "loss": 0.9251965880393982, + "step": 468 + }, + { + "epoch": 0.19831223628691982, + "grad_norm": 2.2260982990264893, + "learning_rate": 4.121265377855888e-05, + "loss": 0.8465172052383423, + "step": 470 + }, + { + "epoch": 0.1991561181434599, + "grad_norm": 2.0510010719299316, + "learning_rate": 4.13884007029877e-05, + "loss": 0.8943672180175781, + "step": 472 + }, + { + "epoch": 0.2, + "grad_norm": 2.2040133476257324, + "learning_rate": 4.156414762741652e-05, + "loss": 0.9594319462776184, + "step": 474 + }, + { + "epoch": 0.2008438818565401, + "grad_norm": 2.355181932449341, + "learning_rate": 4.173989455184534e-05, + "loss": 0.9031813144683838, + "step": 476 + }, + { + "epoch": 0.20168776371308017, + "grad_norm": 2.8434665203094482, + "learning_rate": 4.1915641476274166e-05, + "loss": 0.9225798845291138, + "step": 478 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 2.1715340614318848, + "learning_rate": 4.209138840070299e-05, + "loss": 0.894163966178894, + "step": 480 + }, + { + "epoch": 0.20337552742616033, + "grad_norm": 2.078916072845459, + "learning_rate": 4.226713532513181e-05, + "loss": 0.8424109816551208, + "step": 482 + }, + { + "epoch": 0.2042194092827004, + "grad_norm": 1.9760961532592773, + "learning_rate": 4.244288224956064e-05, + "loss": 0.9102715849876404, + "step": 484 + }, + { + "epoch": 0.20506329113924052, + "grad_norm": 1.9684507846832275, + "learning_rate": 4.2618629173989456e-05, + "loss": 0.8693854808807373, + "step": 486 + }, + { + "epoch": 0.2059071729957806, + "grad_norm": 2.1633450984954834, + "learning_rate": 4.279437609841828e-05, + "loss": 0.8617543578147888, + "step": 488 + }, + { + "epoch": 0.20675105485232068, + "grad_norm": 2.2695257663726807, + "learning_rate": 4.29701230228471e-05, + "loss": 0.9167086482048035, + "step": 490 + }, + { + "epoch": 0.20759493670886076, + "grad_norm": 2.4180049896240234, + "learning_rate": 4.314586994727593e-05, + "loss": 0.8333520889282227, + "step": 492 + }, + { + "epoch": 0.20843881856540084, + "grad_norm": 2.2942769527435303, + "learning_rate": 4.3321616871704746e-05, + "loss": 0.918351411819458, + "step": 494 + }, + { + "epoch": 0.20928270042194091, + "grad_norm": 1.826458215713501, + "learning_rate": 4.349736379613357e-05, + "loss": 0.8565171957015991, + "step": 496 + }, + { + "epoch": 0.21012658227848102, + "grad_norm": 1.9694055318832397, + "learning_rate": 4.367311072056239e-05, + "loss": 0.8684167861938477, + "step": 498 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.892659306526184, + "learning_rate": 4.384885764499122e-05, + "loss": 0.7752788662910461, + "step": 500 + }, + { + "epoch": 0.2109704641350211, + "eval_loss": 0.9080732464790344, + "eval_runtime": 857.0753, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 500 + }, + { + "epoch": 0.21181434599156118, + "grad_norm": 1.9322253465652466, + "learning_rate": 4.4024604569420036e-05, + "loss": 0.948570728302002, + "step": 502 + }, + { + "epoch": 0.21265822784810126, + "grad_norm": 2.0456058979034424, + "learning_rate": 4.4200351493848855e-05, + "loss": 0.8741024732589722, + "step": 504 + }, + { + "epoch": 0.21350210970464134, + "grad_norm": 2.2406177520751953, + "learning_rate": 4.437609841827768e-05, + "loss": 0.9053841829299927, + "step": 506 + }, + { + "epoch": 0.21434599156118145, + "grad_norm": 2.013934850692749, + "learning_rate": 4.455184534270651e-05, + "loss": 0.8886576294898987, + "step": 508 + }, + { + "epoch": 0.21518987341772153, + "grad_norm": 1.9771125316619873, + "learning_rate": 4.4727592267135326e-05, + "loss": 0.8834167718887329, + "step": 510 + }, + { + "epoch": 0.2160337552742616, + "grad_norm": 1.785905361175537, + "learning_rate": 4.4903339191564146e-05, + "loss": 0.7938863039016724, + "step": 512 + }, + { + "epoch": 0.2168776371308017, + "grad_norm": 1.7946031093597412, + "learning_rate": 4.507908611599297e-05, + "loss": 0.8071596026420593, + "step": 514 + }, + { + "epoch": 0.21772151898734177, + "grad_norm": 2.2217721939086914, + "learning_rate": 4.52548330404218e-05, + "loss": 0.797417163848877, + "step": 516 + }, + { + "epoch": 0.21856540084388185, + "grad_norm": 1.9022471904754639, + "learning_rate": 4.5430579964850617e-05, + "loss": 0.8109536170959473, + "step": 518 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.8988343477249146, + "learning_rate": 4.5606326889279436e-05, + "loss": 0.8647034168243408, + "step": 520 + }, + { + "epoch": 0.22025316455696203, + "grad_norm": 2.6014881134033203, + "learning_rate": 4.578207381370827e-05, + "loss": 0.8763713240623474, + "step": 522 + }, + { + "epoch": 0.2210970464135021, + "grad_norm": 1.9512032270431519, + "learning_rate": 4.595782073813709e-05, + "loss": 0.9525764584541321, + "step": 524 + }, + { + "epoch": 0.2219409282700422, + "grad_norm": 1.9246160984039307, + "learning_rate": 4.613356766256591e-05, + "loss": 0.8839208483695984, + "step": 526 + }, + { + "epoch": 0.22278481012658227, + "grad_norm": 1.9713703393936157, + "learning_rate": 4.6309314586994726e-05, + "loss": 0.8888868093490601, + "step": 528 + }, + { + "epoch": 0.22362869198312235, + "grad_norm": 2.1175239086151123, + "learning_rate": 4.648506151142355e-05, + "loss": 0.8123540878295898, + "step": 530 + }, + { + "epoch": 0.22447257383966246, + "grad_norm": 1.7656135559082031, + "learning_rate": 4.666080843585238e-05, + "loss": 0.7447702884674072, + "step": 532 + }, + { + "epoch": 0.22531645569620254, + "grad_norm": 2.15748929977417, + "learning_rate": 4.68365553602812e-05, + "loss": 0.8778411746025085, + "step": 534 + }, + { + "epoch": 0.22616033755274262, + "grad_norm": 2.1733345985412598, + "learning_rate": 4.7012302284710016e-05, + "loss": 0.8985894918441772, + "step": 536 + }, + { + "epoch": 0.2270042194092827, + "grad_norm": 1.7182204723358154, + "learning_rate": 4.718804920913884e-05, + "loss": 0.8031114339828491, + "step": 538 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 1.8586329221725464, + "learning_rate": 4.736379613356767e-05, + "loss": 0.9399706721305847, + "step": 540 + }, + { + "epoch": 0.22869198312236286, + "grad_norm": 2.105637311935425, + "learning_rate": 4.753954305799649e-05, + "loss": 0.8672119975090027, + "step": 542 + }, + { + "epoch": 0.22953586497890296, + "grad_norm": 1.760584831237793, + "learning_rate": 4.771528998242531e-05, + "loss": 0.8663905262947083, + "step": 544 + }, + { + "epoch": 0.23037974683544304, + "grad_norm": 1.579990267753601, + "learning_rate": 4.789103690685413e-05, + "loss": 0.8575801849365234, + "step": 546 + }, + { + "epoch": 0.23122362869198312, + "grad_norm": 1.9242485761642456, + "learning_rate": 4.806678383128295e-05, + "loss": 0.828412652015686, + "step": 548 + }, + { + "epoch": 0.2320675105485232, + "grad_norm": 1.812137246131897, + "learning_rate": 4.824253075571178e-05, + "loss": 0.8183464407920837, + "step": 550 + }, + { + "epoch": 0.23291139240506328, + "grad_norm": 1.804733395576477, + "learning_rate": 4.84182776801406e-05, + "loss": 0.7822491526603699, + "step": 552 + }, + { + "epoch": 0.23375527426160336, + "grad_norm": 2.052257537841797, + "learning_rate": 4.859402460456942e-05, + "loss": 0.9050943851470947, + "step": 554 + }, + { + "epoch": 0.23459915611814347, + "grad_norm": 1.9803621768951416, + "learning_rate": 4.876977152899824e-05, + "loss": 0.8846852779388428, + "step": 556 + }, + { + "epoch": 0.23544303797468355, + "grad_norm": 1.820125937461853, + "learning_rate": 4.894551845342707e-05, + "loss": 0.8649531602859497, + "step": 558 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 2.0963921546936035, + "learning_rate": 4.912126537785589e-05, + "loss": 0.9307748079299927, + "step": 560 + }, + { + "epoch": 0.2371308016877637, + "grad_norm": 2.079697847366333, + "learning_rate": 4.929701230228471e-05, + "loss": 0.9092473387718201, + "step": 562 + }, + { + "epoch": 0.2379746835443038, + "grad_norm": 2.0291287899017334, + "learning_rate": 4.947275922671353e-05, + "loss": 0.8976567983627319, + "step": 564 + }, + { + "epoch": 0.23881856540084387, + "grad_norm": 1.9636707305908203, + "learning_rate": 4.964850615114236e-05, + "loss": 0.8931006193161011, + "step": 566 + }, + { + "epoch": 0.23966244725738398, + "grad_norm": 1.922049880027771, + "learning_rate": 4.982425307557118e-05, + "loss": 0.829562246799469, + "step": 568 + }, + { + "epoch": 0.24050632911392406, + "grad_norm": 2.150334596633911, + "learning_rate": 5e-05, + "loss": 0.8568030595779419, + "step": 570 + }, + { + "epoch": 0.24135021097046414, + "grad_norm": 2.024437427520752, + "learning_rate": 5.017574692442882e-05, + "loss": 0.8623508810997009, + "step": 572 + }, + { + "epoch": 0.24219409282700421, + "grad_norm": 1.8312673568725586, + "learning_rate": 5.035149384885765e-05, + "loss": 0.7853795886039734, + "step": 574 + }, + { + "epoch": 0.2430379746835443, + "grad_norm": 1.9271961450576782, + "learning_rate": 5.0527240773286467e-05, + "loss": 0.9727587103843689, + "step": 576 + }, + { + "epoch": 0.2438818565400844, + "grad_norm": 1.931249976158142, + "learning_rate": 5.0702987697715286e-05, + "loss": 0.8859632015228271, + "step": 578 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.8195210695266724, + "learning_rate": 5.087873462214412e-05, + "loss": 0.8959492444992065, + "step": 580 + }, + { + "epoch": 0.24556962025316456, + "grad_norm": 2.0018749237060547, + "learning_rate": 5.105448154657294e-05, + "loss": 0.8146185874938965, + "step": 582 + }, + { + "epoch": 0.24641350210970464, + "grad_norm": 2.09798526763916, + "learning_rate": 5.1230228471001764e-05, + "loss": 0.8545317053794861, + "step": 584 + }, + { + "epoch": 0.24725738396624472, + "grad_norm": 1.8063944578170776, + "learning_rate": 5.140597539543058e-05, + "loss": 0.8650105595588684, + "step": 586 + }, + { + "epoch": 0.2481012658227848, + "grad_norm": 1.8535740375518799, + "learning_rate": 5.15817223198594e-05, + "loss": 0.8395693302154541, + "step": 588 + }, + { + "epoch": 0.2489451476793249, + "grad_norm": 2.1443960666656494, + "learning_rate": 5.175746924428823e-05, + "loss": 0.8267397284507751, + "step": 590 + }, + { + "epoch": 0.249789029535865, + "grad_norm": 1.9637391567230225, + "learning_rate": 5.193321616871705e-05, + "loss": 0.8500015139579773, + "step": 592 + }, + { + "epoch": 0.25063291139240507, + "grad_norm": 1.9457582235336304, + "learning_rate": 5.2108963093145866e-05, + "loss": 0.887481153011322, + "step": 594 + }, + { + "epoch": 0.2514767932489452, + "grad_norm": 1.7458715438842773, + "learning_rate": 5.228471001757469e-05, + "loss": 0.8444154858589172, + "step": 596 + }, + { + "epoch": 0.2523206751054852, + "grad_norm": 1.8341439962387085, + "learning_rate": 5.2460456942003525e-05, + "loss": 0.8301781415939331, + "step": 598 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 2.127747058868408, + "learning_rate": 5.2636203866432344e-05, + "loss": 0.8921551704406738, + "step": 600 + }, + { + "epoch": 0.25316455696202533, + "eval_loss": 0.8903881311416626, + "eval_runtime": 845.9969, + "eval_samples_per_second": 2.491, + "eval_steps_per_second": 2.491, + "step": 600 + }, + { + "epoch": 0.2540084388185654, + "grad_norm": 2.421459674835205, + "learning_rate": 5.281195079086116e-05, + "loss": 0.8678019642829895, + "step": 602 + }, + { + "epoch": 0.2548523206751055, + "grad_norm": 1.7736057043075562, + "learning_rate": 5.298769771528999e-05, + "loss": 0.8564275503158569, + "step": 604 + }, + { + "epoch": 0.25569620253164554, + "grad_norm": 2.28430438041687, + "learning_rate": 5.316344463971881e-05, + "loss": 0.8529049158096313, + "step": 606 + }, + { + "epoch": 0.25654008438818565, + "grad_norm": 1.8892366886138916, + "learning_rate": 5.333919156414763e-05, + "loss": 0.8672881126403809, + "step": 608 + }, + { + "epoch": 0.25738396624472576, + "grad_norm": 1.9059702157974243, + "learning_rate": 5.3514938488576446e-05, + "loss": 0.9094445109367371, + "step": 610 + }, + { + "epoch": 0.2582278481012658, + "grad_norm": 2.0657339096069336, + "learning_rate": 5.369068541300527e-05, + "loss": 0.8361946940422058, + "step": 612 + }, + { + "epoch": 0.2590717299578059, + "grad_norm": 1.8987553119659424, + "learning_rate": 5.3866432337434105e-05, + "loss": 0.8319925665855408, + "step": 614 + }, + { + "epoch": 0.25991561181434597, + "grad_norm": 2.1176226139068604, + "learning_rate": 5.4042179261862924e-05, + "loss": 0.9818069934844971, + "step": 616 + }, + { + "epoch": 0.2607594936708861, + "grad_norm": 2.142096519470215, + "learning_rate": 5.421792618629174e-05, + "loss": 0.8675919771194458, + "step": 618 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 1.9527089595794678, + "learning_rate": 5.439367311072057e-05, + "loss": 0.8845479488372803, + "step": 620 + }, + { + "epoch": 0.26244725738396624, + "grad_norm": 1.7071453332901, + "learning_rate": 5.456942003514939e-05, + "loss": 0.809393048286438, + "step": 622 + }, + { + "epoch": 0.26329113924050634, + "grad_norm": 1.9133527278900146, + "learning_rate": 5.474516695957821e-05, + "loss": 0.8262377977371216, + "step": 624 + }, + { + "epoch": 0.2641350210970464, + "grad_norm": 2.0217554569244385, + "learning_rate": 5.492091388400703e-05, + "loss": 0.9006736278533936, + "step": 626 + }, + { + "epoch": 0.2649789029535865, + "grad_norm": 1.773273229598999, + "learning_rate": 5.509666080843585e-05, + "loss": 0.8243603110313416, + "step": 628 + }, + { + "epoch": 0.26582278481012656, + "grad_norm": 1.6580880880355835, + "learning_rate": 5.527240773286467e-05, + "loss": 0.8112778663635254, + "step": 630 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.8342082500457764, + "learning_rate": 5.5448154657293504e-05, + "loss": 0.8390820622444153, + "step": 632 + }, + { + "epoch": 0.26751054852320677, + "grad_norm": 1.863695502281189, + "learning_rate": 5.5623901581722323e-05, + "loss": 0.8264521360397339, + "step": 634 + }, + { + "epoch": 0.2683544303797468, + "grad_norm": 1.9462928771972656, + "learning_rate": 5.579964850615115e-05, + "loss": 0.9512701630592346, + "step": 636 + }, + { + "epoch": 0.26919831223628693, + "grad_norm": 1.7776058912277222, + "learning_rate": 5.597539543057997e-05, + "loss": 0.9422703981399536, + "step": 638 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 2.9457077980041504, + "learning_rate": 5.615114235500879e-05, + "loss": 0.7991042137145996, + "step": 640 + }, + { + "epoch": 0.2708860759493671, + "grad_norm": 1.445265531539917, + "learning_rate": 5.6326889279437614e-05, + "loss": 0.8188099265098572, + "step": 642 + }, + { + "epoch": 0.2717299578059072, + "grad_norm": 2.063850164413452, + "learning_rate": 5.650263620386643e-05, + "loss": 0.9799772500991821, + "step": 644 + }, + { + "epoch": 0.27257383966244725, + "grad_norm": 2.0488009452819824, + "learning_rate": 5.667838312829525e-05, + "loss": 0.8462742567062378, + "step": 646 + }, + { + "epoch": 0.27341772151898736, + "grad_norm": 1.8747851848602295, + "learning_rate": 5.685413005272408e-05, + "loss": 0.8226412534713745, + "step": 648 + }, + { + "epoch": 0.2742616033755274, + "grad_norm": 1.849074125289917, + "learning_rate": 5.702987697715291e-05, + "loss": 0.9146338105201721, + "step": 650 + }, + { + "epoch": 0.2751054852320675, + "grad_norm": 1.7738500833511353, + "learning_rate": 5.720562390158173e-05, + "loss": 0.7574424147605896, + "step": 652 + }, + { + "epoch": 0.2759493670886076, + "grad_norm": 1.911102294921875, + "learning_rate": 5.738137082601055e-05, + "loss": 0.8930003046989441, + "step": 654 + }, + { + "epoch": 0.2767932489451477, + "grad_norm": 1.5716617107391357, + "learning_rate": 5.755711775043937e-05, + "loss": 0.7578965425491333, + "step": 656 + }, + { + "epoch": 0.2776371308016878, + "grad_norm": 1.789036512374878, + "learning_rate": 5.7732864674868194e-05, + "loss": 0.8149038553237915, + "step": 658 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.68622624874115, + "learning_rate": 5.790861159929701e-05, + "loss": 0.8265765905380249, + "step": 660 + }, + { + "epoch": 0.27932489451476794, + "grad_norm": 2.078423261642456, + "learning_rate": 5.808435852372583e-05, + "loss": 0.9651970267295837, + "step": 662 + }, + { + "epoch": 0.280168776371308, + "grad_norm": 1.7878645658493042, + "learning_rate": 5.826010544815466e-05, + "loss": 0.8295148015022278, + "step": 664 + }, + { + "epoch": 0.2810126582278481, + "grad_norm": 1.970838189125061, + "learning_rate": 5.843585237258348e-05, + "loss": 0.7778491377830505, + "step": 666 + }, + { + "epoch": 0.2818565400843882, + "grad_norm": 1.943596363067627, + "learning_rate": 5.861159929701231e-05, + "loss": 0.9818071722984314, + "step": 668 + }, + { + "epoch": 0.28270042194092826, + "grad_norm": 1.8793812990188599, + "learning_rate": 5.878734622144113e-05, + "loss": 0.9297797083854675, + "step": 670 + }, + { + "epoch": 0.28354430379746837, + "grad_norm": 1.8813483715057373, + "learning_rate": 5.8963093145869955e-05, + "loss": 0.8748109936714172, + "step": 672 + }, + { + "epoch": 0.2843881856540084, + "grad_norm": 1.7658562660217285, + "learning_rate": 5.9138840070298774e-05, + "loss": 0.8505244851112366, + "step": 674 + }, + { + "epoch": 0.2852320675105485, + "grad_norm": 1.6767617464065552, + "learning_rate": 5.931458699472759e-05, + "loss": 0.8476597666740417, + "step": 676 + }, + { + "epoch": 0.28607594936708863, + "grad_norm": 2.703104257583618, + "learning_rate": 5.949033391915641e-05, + "loss": 0.8775192499160767, + "step": 678 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.9959728717803955, + "learning_rate": 5.966608084358524e-05, + "loss": 0.855262279510498, + "step": 680 + }, + { + "epoch": 0.2877637130801688, + "grad_norm": 1.9093716144561768, + "learning_rate": 5.984182776801406e-05, + "loss": 0.7574936151504517, + "step": 682 + }, + { + "epoch": 0.28860759493670884, + "grad_norm": 1.9829599857330322, + "learning_rate": 6.001757469244289e-05, + "loss": 0.8630690574645996, + "step": 684 + }, + { + "epoch": 0.28945147679324895, + "grad_norm": 1.8777490854263306, + "learning_rate": 6.019332161687171e-05, + "loss": 0.8513249158859253, + "step": 686 + }, + { + "epoch": 0.290295358649789, + "grad_norm": 1.9453173875808716, + "learning_rate": 6.0369068541300535e-05, + "loss": 0.9097008109092712, + "step": 688 + }, + { + "epoch": 0.2911392405063291, + "grad_norm": 1.8527908325195312, + "learning_rate": 6.0544815465729354e-05, + "loss": 0.8291722536087036, + "step": 690 + }, + { + "epoch": 0.2919831223628692, + "grad_norm": 1.9255812168121338, + "learning_rate": 6.0720562390158174e-05, + "loss": 0.880009651184082, + "step": 692 + }, + { + "epoch": 0.29282700421940927, + "grad_norm": 1.6637977361679077, + "learning_rate": 6.0896309314587e-05, + "loss": 0.8791794180870056, + "step": 694 + }, + { + "epoch": 0.2936708860759494, + "grad_norm": 1.825940728187561, + "learning_rate": 6.107205623901582e-05, + "loss": 0.8662407398223877, + "step": 696 + }, + { + "epoch": 0.29451476793248943, + "grad_norm": 1.9348198175430298, + "learning_rate": 6.124780316344464e-05, + "loss": 0.8984515070915222, + "step": 698 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 1.659345030784607, + "learning_rate": 6.142355008787346e-05, + "loss": 0.827385663986206, + "step": 700 + }, + { + "epoch": 0.29535864978902954, + "eval_loss": 0.8730722069740295, + "eval_runtime": 858.184, + "eval_samples_per_second": 2.455, + "eval_steps_per_second": 2.455, + "step": 700 + }, + { + "epoch": 0.29620253164556964, + "grad_norm": 1.6531789302825928, + "learning_rate": 6.159929701230229e-05, + "loss": 0.9337764382362366, + "step": 702 + }, + { + "epoch": 0.2970464135021097, + "grad_norm": 1.8269121646881104, + "learning_rate": 6.177504393673111e-05, + "loss": 0.8250943422317505, + "step": 704 + }, + { + "epoch": 0.2978902953586498, + "grad_norm": 1.692808747291565, + "learning_rate": 6.195079086115994e-05, + "loss": 0.8657428026199341, + "step": 706 + }, + { + "epoch": 0.29873417721518986, + "grad_norm": 1.6736913919448853, + "learning_rate": 6.212653778558876e-05, + "loss": 0.8889590501785278, + "step": 708 + }, + { + "epoch": 0.29957805907172996, + "grad_norm": 1.6841140985488892, + "learning_rate": 6.230228471001758e-05, + "loss": 0.7822914123535156, + "step": 710 + }, + { + "epoch": 0.30042194092827, + "grad_norm": 1.6644599437713623, + "learning_rate": 6.24780316344464e-05, + "loss": 0.8747053742408752, + "step": 712 + }, + { + "epoch": 0.3012658227848101, + "grad_norm": 1.8187819719314575, + "learning_rate": 6.265377855887522e-05, + "loss": 0.8976446390151978, + "step": 714 + }, + { + "epoch": 0.30210970464135023, + "grad_norm": 1.7845178842544556, + "learning_rate": 6.282952548330404e-05, + "loss": 0.9401160478591919, + "step": 716 + }, + { + "epoch": 0.3029535864978903, + "grad_norm": 1.559773564338684, + "learning_rate": 6.300527240773286e-05, + "loss": 0.8754280209541321, + "step": 718 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 1.5919631719589233, + "learning_rate": 6.318101933216169e-05, + "loss": 0.8278581500053406, + "step": 720 + }, + { + "epoch": 0.30464135021097044, + "grad_norm": 1.8551076650619507, + "learning_rate": 6.335676625659052e-05, + "loss": 0.8868640065193176, + "step": 722 + }, + { + "epoch": 0.30548523206751055, + "grad_norm": 1.6907769441604614, + "learning_rate": 6.353251318101934e-05, + "loss": 0.8631605505943298, + "step": 724 + }, + { + "epoch": 0.30632911392405066, + "grad_norm": 1.820867657661438, + "learning_rate": 6.370826010544816e-05, + "loss": 0.9142873883247375, + "step": 726 + }, + { + "epoch": 0.3071729957805907, + "grad_norm": 1.685154676437378, + "learning_rate": 6.388400702987698e-05, + "loss": 0.8258634805679321, + "step": 728 + }, + { + "epoch": 0.3080168776371308, + "grad_norm": 1.9294627904891968, + "learning_rate": 6.40597539543058e-05, + "loss": 0.9545516967773438, + "step": 730 + }, + { + "epoch": 0.30886075949367087, + "grad_norm": 1.6075409650802612, + "learning_rate": 6.423550087873462e-05, + "loss": 0.8370757699012756, + "step": 732 + }, + { + "epoch": 0.309704641350211, + "grad_norm": 1.635750651359558, + "learning_rate": 6.441124780316345e-05, + "loss": 0.8356084823608398, + "step": 734 + }, + { + "epoch": 0.3105485232067511, + "grad_norm": 1.6376131772994995, + "learning_rate": 6.458699472759227e-05, + "loss": 0.7579531669616699, + "step": 736 + }, + { + "epoch": 0.31139240506329113, + "grad_norm": 1.7135766744613647, + "learning_rate": 6.47627416520211e-05, + "loss": 0.8436318039894104, + "step": 738 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 1.7095093727111816, + "learning_rate": 6.493848857644992e-05, + "loss": 0.7998805046081543, + "step": 740 + }, + { + "epoch": 0.3130801687763713, + "grad_norm": 1.782615303993225, + "learning_rate": 6.511423550087874e-05, + "loss": 0.915776789188385, + "step": 742 + }, + { + "epoch": 0.3139240506329114, + "grad_norm": 1.8461172580718994, + "learning_rate": 6.528998242530756e-05, + "loss": 0.8300962448120117, + "step": 744 + }, + { + "epoch": 0.31476793248945145, + "grad_norm": 1.5659871101379395, + "learning_rate": 6.546572934973638e-05, + "loss": 0.8239848017692566, + "step": 746 + }, + { + "epoch": 0.31561181434599156, + "grad_norm": 1.9997349977493286, + "learning_rate": 6.56414762741652e-05, + "loss": 0.8236988186836243, + "step": 748 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.9811526536941528, + "learning_rate": 6.581722319859403e-05, + "loss": 0.8516603112220764, + "step": 750 + }, + { + "epoch": 0.3172995780590717, + "grad_norm": 1.9877923727035522, + "learning_rate": 6.599297012302285e-05, + "loss": 0.9037567973136902, + "step": 752 + }, + { + "epoch": 0.3181434599156118, + "grad_norm": 1.6729352474212646, + "learning_rate": 6.616871704745168e-05, + "loss": 0.8350864052772522, + "step": 754 + }, + { + "epoch": 0.3189873417721519, + "grad_norm": 1.9055802822113037, + "learning_rate": 6.63444639718805e-05, + "loss": 0.8246616125106812, + "step": 756 + }, + { + "epoch": 0.319831223628692, + "grad_norm": 1.597999930381775, + "learning_rate": 6.652021089630932e-05, + "loss": 0.8014416098594666, + "step": 758 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.7432531118392944, + "learning_rate": 6.669595782073814e-05, + "loss": 0.9199523329734802, + "step": 760 + }, + { + "epoch": 0.32151898734177214, + "grad_norm": 1.820164442062378, + "learning_rate": 6.687170474516696e-05, + "loss": 0.7764829397201538, + "step": 762 + }, + { + "epoch": 0.32236286919831225, + "grad_norm": 1.6408652067184448, + "learning_rate": 6.704745166959578e-05, + "loss": 0.8072620630264282, + "step": 764 + }, + { + "epoch": 0.3232067510548523, + "grad_norm": 1.8894155025482178, + "learning_rate": 6.722319859402461e-05, + "loss": 0.9006885886192322, + "step": 766 + }, + { + "epoch": 0.3240506329113924, + "grad_norm": 1.6903613805770874, + "learning_rate": 6.739894551845343e-05, + "loss": 0.7772189378738403, + "step": 768 + }, + { + "epoch": 0.32489451476793246, + "grad_norm": 1.7540696859359741, + "learning_rate": 6.757469244288225e-05, + "loss": 0.8825590014457703, + "step": 770 + }, + { + "epoch": 0.32573839662447257, + "grad_norm": 1.603008508682251, + "learning_rate": 6.775043936731108e-05, + "loss": 0.8376453518867493, + "step": 772 + }, + { + "epoch": 0.3265822784810127, + "grad_norm": 1.5381462574005127, + "learning_rate": 6.79261862917399e-05, + "loss": 0.92608243227005, + "step": 774 + }, + { + "epoch": 0.32742616033755273, + "grad_norm": 1.4815537929534912, + "learning_rate": 6.810193321616872e-05, + "loss": 0.6842183470726013, + "step": 776 + }, + { + "epoch": 0.32827004219409284, + "grad_norm": 1.8543411493301392, + "learning_rate": 6.827768014059754e-05, + "loss": 0.8868235349655151, + "step": 778 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 1.8895748853683472, + "learning_rate": 6.845342706502637e-05, + "loss": 0.8148112297058105, + "step": 780 + }, + { + "epoch": 0.329957805907173, + "grad_norm": 1.8150591850280762, + "learning_rate": 6.862917398945519e-05, + "loss": 0.8760337829589844, + "step": 782 + }, + { + "epoch": 0.3308016877637131, + "grad_norm": 1.6661378145217896, + "learning_rate": 6.880492091388401e-05, + "loss": 0.8266322612762451, + "step": 784 + }, + { + "epoch": 0.33164556962025316, + "grad_norm": 2.2849128246307373, + "learning_rate": 6.898066783831283e-05, + "loss": 0.8599053025245667, + "step": 786 + }, + { + "epoch": 0.33248945147679326, + "grad_norm": 1.7233171463012695, + "learning_rate": 6.915641476274165e-05, + "loss": 0.8312317132949829, + "step": 788 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.7637618780136108, + "learning_rate": 6.933216168717048e-05, + "loss": 0.8379700779914856, + "step": 790 + }, + { + "epoch": 0.3341772151898734, + "grad_norm": 1.7780474424362183, + "learning_rate": 6.95079086115993e-05, + "loss": 0.8994934558868408, + "step": 792 + }, + { + "epoch": 0.33502109704641353, + "grad_norm": 1.5798883438110352, + "learning_rate": 6.968365553602812e-05, + "loss": 0.8021857738494873, + "step": 794 + }, + { + "epoch": 0.3358649789029536, + "grad_norm": 1.7316070795059204, + "learning_rate": 6.985940246045695e-05, + "loss": 0.8814419507980347, + "step": 796 + }, + { + "epoch": 0.3367088607594937, + "grad_norm": 1.711315631866455, + "learning_rate": 7.003514938488577e-05, + "loss": 0.8545029163360596, + "step": 798 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 1.5023137331008911, + "learning_rate": 7.021089630931459e-05, + "loss": 0.8006189465522766, + "step": 800 + }, + { + "epoch": 0.33755274261603374, + "eval_loss": 0.8635594248771667, + "eval_runtime": 865.9348, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 800 + }, + { + "epoch": 0.33839662447257385, + "grad_norm": 1.8377124071121216, + "learning_rate": 7.038664323374341e-05, + "loss": 0.7625874280929565, + "step": 802 + }, + { + "epoch": 0.3392405063291139, + "grad_norm": 1.5361332893371582, + "learning_rate": 7.056239015817223e-05, + "loss": 0.8490484356880188, + "step": 804 + }, + { + "epoch": 0.340084388185654, + "grad_norm": 1.8727388381958008, + "learning_rate": 7.073813708260105e-05, + "loss": 0.8915753364562988, + "step": 806 + }, + { + "epoch": 0.3409282700421941, + "grad_norm": 1.567700743675232, + "learning_rate": 7.091388400702988e-05, + "loss": 0.8902620077133179, + "step": 808 + }, + { + "epoch": 0.34177215189873417, + "grad_norm": 1.5302914381027222, + "learning_rate": 7.10896309314587e-05, + "loss": 0.7897103428840637, + "step": 810 + }, + { + "epoch": 0.3426160337552743, + "grad_norm": 1.8819153308868408, + "learning_rate": 7.126537785588753e-05, + "loss": 0.8648831248283386, + "step": 812 + }, + { + "epoch": 0.3434599156118143, + "grad_norm": 1.5671379566192627, + "learning_rate": 7.144112478031635e-05, + "loss": 0.8449499607086182, + "step": 814 + }, + { + "epoch": 0.34430379746835443, + "grad_norm": 1.6570971012115479, + "learning_rate": 7.161687170474517e-05, + "loss": 0.848559558391571, + "step": 816 + }, + { + "epoch": 0.34514767932489454, + "grad_norm": 1.9108437299728394, + "learning_rate": 7.179261862917399e-05, + "loss": 0.8847543597221375, + "step": 818 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.4909496307373047, + "learning_rate": 7.196836555360281e-05, + "loss": 0.7642563581466675, + "step": 820 + }, + { + "epoch": 0.3468354430379747, + "grad_norm": 1.768518328666687, + "learning_rate": 7.214411247803163e-05, + "loss": 0.8714305758476257, + "step": 822 + }, + { + "epoch": 0.34767932489451475, + "grad_norm": 1.715343952178955, + "learning_rate": 7.231985940246046e-05, + "loss": 0.7712987661361694, + "step": 824 + }, + { + "epoch": 0.34852320675105486, + "grad_norm": 1.6687803268432617, + "learning_rate": 7.24956063268893e-05, + "loss": 0.8122798204421997, + "step": 826 + }, + { + "epoch": 0.3493670886075949, + "grad_norm": 1.5160514116287231, + "learning_rate": 7.267135325131811e-05, + "loss": 0.793245792388916, + "step": 828 + }, + { + "epoch": 0.350210970464135, + "grad_norm": 1.6449401378631592, + "learning_rate": 7.284710017574693e-05, + "loss": 0.8747497200965881, + "step": 830 + }, + { + "epoch": 0.3510548523206751, + "grad_norm": 1.3907722234725952, + "learning_rate": 7.302284710017575e-05, + "loss": 0.6743978261947632, + "step": 832 + }, + { + "epoch": 0.3518987341772152, + "grad_norm": 1.633555293083191, + "learning_rate": 7.319859402460457e-05, + "loss": 0.8524789214134216, + "step": 834 + }, + { + "epoch": 0.3527426160337553, + "grad_norm": 1.5414257049560547, + "learning_rate": 7.337434094903339e-05, + "loss": 0.8045110702514648, + "step": 836 + }, + { + "epoch": 0.35358649789029534, + "grad_norm": 1.8520616292953491, + "learning_rate": 7.355008787346221e-05, + "loss": 0.8319593071937561, + "step": 838 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.6629763841629028, + "learning_rate": 7.372583479789104e-05, + "loss": 0.8188939094543457, + "step": 840 + }, + { + "epoch": 0.35527426160337555, + "grad_norm": 1.804087519645691, + "learning_rate": 7.390158172231987e-05, + "loss": 0.8875360488891602, + "step": 842 + }, + { + "epoch": 0.3561181434599156, + "grad_norm": 1.6031663417816162, + "learning_rate": 7.407732864674869e-05, + "loss": 0.8159612417221069, + "step": 844 + }, + { + "epoch": 0.3569620253164557, + "grad_norm": 1.7413033246994019, + "learning_rate": 7.425307557117751e-05, + "loss": 0.8422684669494629, + "step": 846 + }, + { + "epoch": 0.35780590717299576, + "grad_norm": 1.7699719667434692, + "learning_rate": 7.442882249560633e-05, + "loss": 0.9343502521514893, + "step": 848 + }, + { + "epoch": 0.35864978902953587, + "grad_norm": 1.4613301753997803, + "learning_rate": 7.460456942003515e-05, + "loss": 0.8168979287147522, + "step": 850 + }, + { + "epoch": 0.3594936708860759, + "grad_norm": 1.542431354522705, + "learning_rate": 7.478031634446397e-05, + "loss": 0.9014382362365723, + "step": 852 + }, + { + "epoch": 0.36033755274261603, + "grad_norm": 1.6070159673690796, + "learning_rate": 7.49560632688928e-05, + "loss": 0.8162738084793091, + "step": 854 + }, + { + "epoch": 0.36118143459915614, + "grad_norm": 1.7979451417922974, + "learning_rate": 7.513181019332162e-05, + "loss": 0.8354527950286865, + "step": 856 + }, + { + "epoch": 0.3620253164556962, + "grad_norm": 2.327045202255249, + "learning_rate": 7.530755711775044e-05, + "loss": 0.8214042782783508, + "step": 858 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 1.5085111856460571, + "learning_rate": 7.548330404217927e-05, + "loss": 0.7472147941589355, + "step": 860 + }, + { + "epoch": 0.36371308016877635, + "grad_norm": 1.6006290912628174, + "learning_rate": 7.565905096660809e-05, + "loss": 0.7586950063705444, + "step": 862 + }, + { + "epoch": 0.36455696202531646, + "grad_norm": 1.5170620679855347, + "learning_rate": 7.583479789103691e-05, + "loss": 0.8169914484024048, + "step": 864 + }, + { + "epoch": 0.36540084388185656, + "grad_norm": 1.5848352909088135, + "learning_rate": 7.601054481546573e-05, + "loss": 0.8263922929763794, + "step": 866 + }, + { + "epoch": 0.3662447257383966, + "grad_norm": 1.8502342700958252, + "learning_rate": 7.618629173989455e-05, + "loss": 0.8726240992546082, + "step": 868 + }, + { + "epoch": 0.3670886075949367, + "grad_norm": 1.506847620010376, + "learning_rate": 7.636203866432338e-05, + "loss": 0.7220374941825867, + "step": 870 + }, + { + "epoch": 0.3679324894514768, + "grad_norm": 1.5350452661514282, + "learning_rate": 7.65377855887522e-05, + "loss": 0.8028547167778015, + "step": 872 + }, + { + "epoch": 0.3687763713080169, + "grad_norm": 1.5011043548583984, + "learning_rate": 7.671353251318102e-05, + "loss": 0.7659649848937988, + "step": 874 + }, + { + "epoch": 0.369620253164557, + "grad_norm": 1.7019832134246826, + "learning_rate": 7.688927943760984e-05, + "loss": 0.8773653507232666, + "step": 876 + }, + { + "epoch": 0.37046413502109704, + "grad_norm": 1.4918498992919922, + "learning_rate": 7.706502636203867e-05, + "loss": 0.7977569103240967, + "step": 878 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 1.6422638893127441, + "learning_rate": 7.724077328646749e-05, + "loss": 0.7491976022720337, + "step": 880 + }, + { + "epoch": 0.3721518987341772, + "grad_norm": 1.7590434551239014, + "learning_rate": 7.741652021089631e-05, + "loss": 0.8754181265830994, + "step": 882 + }, + { + "epoch": 0.3729957805907173, + "grad_norm": 3.868894100189209, + "learning_rate": 7.759226713532513e-05, + "loss": 0.8482301235198975, + "step": 884 + }, + { + "epoch": 0.37383966244725736, + "grad_norm": 2.111875534057617, + "learning_rate": 7.776801405975396e-05, + "loss": 0.8109031915664673, + "step": 886 + }, + { + "epoch": 0.37468354430379747, + "grad_norm": 2.0838418006896973, + "learning_rate": 7.794376098418278e-05, + "loss": 0.8660775423049927, + "step": 888 + }, + { + "epoch": 0.3755274261603376, + "grad_norm": 1.553022027015686, + "learning_rate": 7.81195079086116e-05, + "loss": 0.8418024778366089, + "step": 890 + }, + { + "epoch": 0.3763713080168776, + "grad_norm": 1.334747314453125, + "learning_rate": 7.829525483304042e-05, + "loss": 0.7764869928359985, + "step": 892 + }, + { + "epoch": 0.37721518987341773, + "grad_norm": 1.4692286252975464, + "learning_rate": 7.847100175746925e-05, + "loss": 0.7460401654243469, + "step": 894 + }, + { + "epoch": 0.3780590717299578, + "grad_norm": 1.5374023914337158, + "learning_rate": 7.864674868189807e-05, + "loss": 0.7662873268127441, + "step": 896 + }, + { + "epoch": 0.3789029535864979, + "grad_norm": 1.5662524700164795, + "learning_rate": 7.882249560632689e-05, + "loss": 0.8165306448936462, + "step": 898 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 4.498590469360352, + "learning_rate": 7.899824253075572e-05, + "loss": 0.7913232445716858, + "step": 900 + }, + { + "epoch": 0.379746835443038, + "eval_loss": 0.8491304516792297, + "eval_runtime": 852.6211, + "eval_samples_per_second": 2.471, + "eval_steps_per_second": 2.471, + "step": 900 + }, + { + "epoch": 0.38059071729957805, + "grad_norm": 1.6320613622665405, + "learning_rate": 7.917398945518454e-05, + "loss": 0.8097161054611206, + "step": 902 + }, + { + "epoch": 0.38143459915611816, + "grad_norm": 1.2562934160232544, + "learning_rate": 7.934973637961336e-05, + "loss": 0.786399781703949, + "step": 904 + }, + { + "epoch": 0.3822784810126582, + "grad_norm": 1.6957594156265259, + "learning_rate": 7.952548330404218e-05, + "loss": 0.8385500311851501, + "step": 906 + }, + { + "epoch": 0.3831223628691983, + "grad_norm": 1.6662386655807495, + "learning_rate": 7.9701230228471e-05, + "loss": 0.8157848715782166, + "step": 908 + }, + { + "epoch": 0.38396624472573837, + "grad_norm": 1.6717777252197266, + "learning_rate": 7.987697715289982e-05, + "loss": 0.7937968373298645, + "step": 910 + }, + { + "epoch": 0.3848101265822785, + "grad_norm": 1.399484395980835, + "learning_rate": 8.005272407732865e-05, + "loss": 0.7800109386444092, + "step": 912 + }, + { + "epoch": 0.3856540084388186, + "grad_norm": 1.5671080350875854, + "learning_rate": 8.022847100175747e-05, + "loss": 0.8135939240455627, + "step": 914 + }, + { + "epoch": 0.38649789029535864, + "grad_norm": 1.4427763223648071, + "learning_rate": 8.04042179261863e-05, + "loss": 0.7482035160064697, + "step": 916 + }, + { + "epoch": 0.38734177215189874, + "grad_norm": 1.3314121961593628, + "learning_rate": 8.057996485061512e-05, + "loss": 0.7201873064041138, + "step": 918 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 1.5695286989212036, + "learning_rate": 8.075571177504394e-05, + "loss": 0.7933040857315063, + "step": 920 + }, + { + "epoch": 0.3890295358649789, + "grad_norm": 1.5091747045516968, + "learning_rate": 8.093145869947276e-05, + "loss": 0.8058338165283203, + "step": 922 + }, + { + "epoch": 0.389873417721519, + "grad_norm": 1.6287630796432495, + "learning_rate": 8.110720562390158e-05, + "loss": 0.7617828249931335, + "step": 924 + }, + { + "epoch": 0.39071729957805906, + "grad_norm": 1.6129482984542847, + "learning_rate": 8.12829525483304e-05, + "loss": 0.8710150122642517, + "step": 926 + }, + { + "epoch": 0.39156118143459917, + "grad_norm": 1.6457173824310303, + "learning_rate": 8.145869947275922e-05, + "loss": 0.9122233390808105, + "step": 928 + }, + { + "epoch": 0.3924050632911392, + "grad_norm": 1.6768827438354492, + "learning_rate": 8.163444639718805e-05, + "loss": 0.8339303731918335, + "step": 930 + }, + { + "epoch": 0.39324894514767933, + "grad_norm": 1.5419740676879883, + "learning_rate": 8.181019332161688e-05, + "loss": 0.8220396041870117, + "step": 932 + }, + { + "epoch": 0.39409282700421944, + "grad_norm": 1.4563747644424438, + "learning_rate": 8.19859402460457e-05, + "loss": 0.8531478047370911, + "step": 934 + }, + { + "epoch": 0.3949367088607595, + "grad_norm": 1.6208328008651733, + "learning_rate": 8.216168717047452e-05, + "loss": 0.8330869078636169, + "step": 936 + }, + { + "epoch": 0.3957805907172996, + "grad_norm": 1.6492482423782349, + "learning_rate": 8.233743409490334e-05, + "loss": 0.8011296987533569, + "step": 938 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 2.1611905097961426, + "learning_rate": 8.251318101933216e-05, + "loss": 0.8111353516578674, + "step": 940 + }, + { + "epoch": 0.39746835443037976, + "grad_norm": 1.7108231782913208, + "learning_rate": 8.268892794376098e-05, + "loss": 0.8282017111778259, + "step": 942 + }, + { + "epoch": 0.3983122362869198, + "grad_norm": 1.543465495109558, + "learning_rate": 8.286467486818981e-05, + "loss": 0.7770059704780579, + "step": 944 + }, + { + "epoch": 0.3991561181434599, + "grad_norm": 1.419969081878662, + "learning_rate": 8.304042179261863e-05, + "loss": 0.8646430373191833, + "step": 946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5002100467681885, + "learning_rate": 8.321616871704746e-05, + "loss": 0.7949403524398804, + "step": 948 + }, + { + "epoch": 0.4008438818565401, + "grad_norm": 1.38933265209198, + "learning_rate": 8.339191564147628e-05, + "loss": 0.8124079704284668, + "step": 950 + }, + { + "epoch": 0.4016877637130802, + "grad_norm": 1.5948443412780762, + "learning_rate": 8.35676625659051e-05, + "loss": 0.8634148836135864, + "step": 952 + }, + { + "epoch": 0.40253164556962023, + "grad_norm": 1.4437624216079712, + "learning_rate": 8.374340949033392e-05, + "loss": 0.7410681247711182, + "step": 954 + }, + { + "epoch": 0.40337552742616034, + "grad_norm": 1.3457095623016357, + "learning_rate": 8.391915641476274e-05, + "loss": 0.7680280208587646, + "step": 956 + }, + { + "epoch": 0.40421940928270045, + "grad_norm": 1.610288143157959, + "learning_rate": 8.409490333919156e-05, + "loss": 0.7921904921531677, + "step": 958 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 1.5321530103683472, + "learning_rate": 8.427065026362039e-05, + "loss": 0.8320037126541138, + "step": 960 + }, + { + "epoch": 0.4059071729957806, + "grad_norm": 1.699881672859192, + "learning_rate": 8.444639718804921e-05, + "loss": 0.8303092122077942, + "step": 962 + }, + { + "epoch": 0.40675105485232066, + "grad_norm": 1.591515064239502, + "learning_rate": 8.462214411247804e-05, + "loss": 0.9029796719551086, + "step": 964 + }, + { + "epoch": 0.40759493670886077, + "grad_norm": 1.5930429697036743, + "learning_rate": 8.479789103690686e-05, + "loss": 0.8165359497070312, + "step": 966 + }, + { + "epoch": 0.4084388185654008, + "grad_norm": 1.509774923324585, + "learning_rate": 8.497363796133568e-05, + "loss": 0.8276026248931885, + "step": 968 + }, + { + "epoch": 0.4092827004219409, + "grad_norm": 1.3617016077041626, + "learning_rate": 8.51493848857645e-05, + "loss": 0.8159419894218445, + "step": 970 + }, + { + "epoch": 0.41012658227848103, + "grad_norm": 1.3580708503723145, + "learning_rate": 8.532513181019332e-05, + "loss": 0.7882336378097534, + "step": 972 + }, + { + "epoch": 0.4109704641350211, + "grad_norm": 1.3337358236312866, + "learning_rate": 8.550087873462214e-05, + "loss": 0.7462319731712341, + "step": 974 + }, + { + "epoch": 0.4118143459915612, + "grad_norm": 1.450363278388977, + "learning_rate": 8.567662565905097e-05, + "loss": 0.7500866651535034, + "step": 976 + }, + { + "epoch": 0.41265822784810124, + "grad_norm": 1.5305321216583252, + "learning_rate": 8.585237258347979e-05, + "loss": 0.8432503342628479, + "step": 978 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 1.2097326517105103, + "learning_rate": 8.602811950790861e-05, + "loss": 0.8330482840538025, + "step": 980 + }, + { + "epoch": 0.41434599156118146, + "grad_norm": 1.3916101455688477, + "learning_rate": 8.620386643233744e-05, + "loss": 0.8137149810791016, + "step": 982 + }, + { + "epoch": 0.4151898734177215, + "grad_norm": 1.6411453485488892, + "learning_rate": 8.637961335676626e-05, + "loss": 0.8273854851722717, + "step": 984 + }, + { + "epoch": 0.4160337552742616, + "grad_norm": 1.6734566688537598, + "learning_rate": 8.655536028119508e-05, + "loss": 0.794026255607605, + "step": 986 + }, + { + "epoch": 0.41687763713080167, + "grad_norm": 1.352325677871704, + "learning_rate": 8.67311072056239e-05, + "loss": 0.7721655368804932, + "step": 988 + }, + { + "epoch": 0.4177215189873418, + "grad_norm": 1.5368729829788208, + "learning_rate": 8.690685413005273e-05, + "loss": 0.8123438954353333, + "step": 990 + }, + { + "epoch": 0.41856540084388183, + "grad_norm": 1.4903568029403687, + "learning_rate": 8.708260105448155e-05, + "loss": 0.8370974659919739, + "step": 992 + }, + { + "epoch": 0.41940928270042194, + "grad_norm": 1.3405622243881226, + "learning_rate": 8.725834797891037e-05, + "loss": 0.780426561832428, + "step": 994 + }, + { + "epoch": 0.42025316455696204, + "grad_norm": 1.4761021137237549, + "learning_rate": 8.743409490333919e-05, + "loss": 0.8304934501647949, + "step": 996 + }, + { + "epoch": 0.4210970464135021, + "grad_norm": 1.520033359527588, + "learning_rate": 8.760984182776801e-05, + "loss": 0.7960568070411682, + "step": 998 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 1.6916255950927734, + "learning_rate": 8.778558875219684e-05, + "loss": 0.7884663939476013, + "step": 1000 + }, + { + "epoch": 0.4219409282700422, + "eval_loss": 0.8388314247131348, + "eval_runtime": 847.4828, + "eval_samples_per_second": 2.486, + "eval_steps_per_second": 2.486, + "step": 1000 + }, + { + "epoch": 0.42278481012658226, + "grad_norm": 1.6796396970748901, + "learning_rate": 8.796133567662566e-05, + "loss": 0.7930826544761658, + "step": 1002 + }, + { + "epoch": 0.42362869198312236, + "grad_norm": 1.4480048418045044, + "learning_rate": 8.813708260105448e-05, + "loss": 0.7138194441795349, + "step": 1004 + }, + { + "epoch": 0.42447257383966247, + "grad_norm": 1.2499021291732788, + "learning_rate": 8.831282952548331e-05, + "loss": 0.7367453575134277, + "step": 1006 + }, + { + "epoch": 0.4253164556962025, + "grad_norm": 1.6906769275665283, + "learning_rate": 8.848857644991213e-05, + "loss": 0.9051005244255066, + "step": 1008 + }, + { + "epoch": 0.42616033755274263, + "grad_norm": 1.4196792840957642, + "learning_rate": 8.866432337434095e-05, + "loss": 0.7469457387924194, + "step": 1010 + }, + { + "epoch": 0.4270042194092827, + "grad_norm": 1.5132776498794556, + "learning_rate": 8.884007029876977e-05, + "loss": 0.7443049550056458, + "step": 1012 + }, + { + "epoch": 0.4278481012658228, + "grad_norm": 1.335705280303955, + "learning_rate": 8.901581722319859e-05, + "loss": 0.784084677696228, + "step": 1014 + }, + { + "epoch": 0.4286919831223629, + "grad_norm": 1.6510252952575684, + "learning_rate": 8.919156414762741e-05, + "loss": 0.8603647947311401, + "step": 1016 + }, + { + "epoch": 0.42953586497890295, + "grad_norm": 1.35535728931427, + "learning_rate": 8.936731107205624e-05, + "loss": 0.7921645641326904, + "step": 1018 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 1.4952049255371094, + "learning_rate": 8.954305799648506e-05, + "loss": 0.799993634223938, + "step": 1020 + }, + { + "epoch": 0.4312236286919831, + "grad_norm": 1.5026042461395264, + "learning_rate": 8.97188049209139e-05, + "loss": 0.7697094082832336, + "step": 1022 + }, + { + "epoch": 0.4320675105485232, + "grad_norm": 1.5424275398254395, + "learning_rate": 8.989455184534271e-05, + "loss": 0.7988215684890747, + "step": 1024 + }, + { + "epoch": 0.43291139240506327, + "grad_norm": 1.438716173171997, + "learning_rate": 9.007029876977153e-05, + "loss": 0.7841635942459106, + "step": 1026 + }, + { + "epoch": 0.4337552742616034, + "grad_norm": 1.5040369033813477, + "learning_rate": 9.024604569420035e-05, + "loss": 0.7485025525093079, + "step": 1028 + }, + { + "epoch": 0.4345991561181435, + "grad_norm": 1.4354394674301147, + "learning_rate": 9.042179261862917e-05, + "loss": 0.7735623121261597, + "step": 1030 + }, + { + "epoch": 0.43544303797468353, + "grad_norm": 1.4841680526733398, + "learning_rate": 9.059753954305799e-05, + "loss": 0.8918828964233398, + "step": 1032 + }, + { + "epoch": 0.43628691983122364, + "grad_norm": 1.428813099861145, + "learning_rate": 9.077328646748682e-05, + "loss": 0.835110068321228, + "step": 1034 + }, + { + "epoch": 0.4371308016877637, + "grad_norm": 1.559020757675171, + "learning_rate": 9.094903339191566e-05, + "loss": 0.746295690536499, + "step": 1036 + }, + { + "epoch": 0.4379746835443038, + "grad_norm": 1.6996115446090698, + "learning_rate": 9.112478031634448e-05, + "loss": 0.8089123368263245, + "step": 1038 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 1.6615465879440308, + "learning_rate": 9.13005272407733e-05, + "loss": 0.8807073831558228, + "step": 1040 + }, + { + "epoch": 0.43966244725738396, + "grad_norm": 1.239142894744873, + "learning_rate": 9.147627416520211e-05, + "loss": 0.7638427019119263, + "step": 1042 + }, + { + "epoch": 0.44050632911392407, + "grad_norm": 1.1915178298950195, + "learning_rate": 9.165202108963093e-05, + "loss": 0.7817409634590149, + "step": 1044 + }, + { + "epoch": 0.4413502109704641, + "grad_norm": 1.6276934146881104, + "learning_rate": 9.182776801405975e-05, + "loss": 0.8586427569389343, + "step": 1046 + }, + { + "epoch": 0.4421940928270042, + "grad_norm": 1.480345606803894, + "learning_rate": 9.200351493848857e-05, + "loss": 0.7481811046600342, + "step": 1048 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.308419108390808, + "learning_rate": 9.21792618629174e-05, + "loss": 0.8074686527252197, + "step": 1050 + }, + { + "epoch": 0.4438818565400844, + "grad_norm": 1.6167182922363281, + "learning_rate": 9.235500878734624e-05, + "loss": 0.8455166816711426, + "step": 1052 + }, + { + "epoch": 0.4447257383966245, + "grad_norm": 1.6058826446533203, + "learning_rate": 9.253075571177506e-05, + "loss": 0.7255295515060425, + "step": 1054 + }, + { + "epoch": 0.44556962025316454, + "grad_norm": 1.6745728254318237, + "learning_rate": 9.270650263620387e-05, + "loss": 0.8329368233680725, + "step": 1056 + }, + { + "epoch": 0.44641350210970465, + "grad_norm": 1.5657380819320679, + "learning_rate": 9.28822495606327e-05, + "loss": 0.8583613634109497, + "step": 1058 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 1.5052601099014282, + "learning_rate": 9.305799648506151e-05, + "loss": 0.8546127080917358, + "step": 1060 + }, + { + "epoch": 0.4481012658227848, + "grad_norm": 1.510636806488037, + "learning_rate": 9.323374340949033e-05, + "loss": 0.8416863679885864, + "step": 1062 + }, + { + "epoch": 0.4489451476793249, + "grad_norm": 1.4446617364883423, + "learning_rate": 9.340949033391916e-05, + "loss": 0.830390453338623, + "step": 1064 + }, + { + "epoch": 0.44978902953586497, + "grad_norm": 1.6032582521438599, + "learning_rate": 9.358523725834798e-05, + "loss": 0.8000447154045105, + "step": 1066 + }, + { + "epoch": 0.4506329113924051, + "grad_norm": 1.5295692682266235, + "learning_rate": 9.37609841827768e-05, + "loss": 0.8310818672180176, + "step": 1068 + }, + { + "epoch": 0.45147679324894513, + "grad_norm": 1.3161942958831787, + "learning_rate": 9.393673110720564e-05, + "loss": 0.8377846479415894, + "step": 1070 + }, + { + "epoch": 0.45232067510548524, + "grad_norm": 1.4101601839065552, + "learning_rate": 9.411247803163445e-05, + "loss": 0.7852389216423035, + "step": 1072 + }, + { + "epoch": 0.4531645569620253, + "grad_norm": 1.4352775812149048, + "learning_rate": 9.428822495606327e-05, + "loss": 0.8763723969459534, + "step": 1074 + }, + { + "epoch": 0.4540084388185654, + "grad_norm": 1.4584673643112183, + "learning_rate": 9.44639718804921e-05, + "loss": 0.8177199363708496, + "step": 1076 + }, + { + "epoch": 0.4548523206751055, + "grad_norm": 1.6470575332641602, + "learning_rate": 9.463971880492091e-05, + "loss": 0.8333053588867188, + "step": 1078 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 1.4429512023925781, + "learning_rate": 9.481546572934975e-05, + "loss": 0.8546649217605591, + "step": 1080 + }, + { + "epoch": 0.45654008438818566, + "grad_norm": 1.4885371923446655, + "learning_rate": 9.499121265377856e-05, + "loss": 0.838036298751831, + "step": 1082 + }, + { + "epoch": 0.4573839662447257, + "grad_norm": 1.4601678848266602, + "learning_rate": 9.516695957820738e-05, + "loss": 0.7295010089874268, + "step": 1084 + }, + { + "epoch": 0.4582278481012658, + "grad_norm": 1.2399365901947021, + "learning_rate": 9.53427065026362e-05, + "loss": 0.6990782618522644, + "step": 1086 + }, + { + "epoch": 0.45907172995780593, + "grad_norm": 1.2936921119689941, + "learning_rate": 9.551845342706504e-05, + "loss": 0.7790928483009338, + "step": 1088 + }, + { + "epoch": 0.459915611814346, + "grad_norm": 1.3408331871032715, + "learning_rate": 9.569420035149385e-05, + "loss": 0.8061056733131409, + "step": 1090 + }, + { + "epoch": 0.4607594936708861, + "grad_norm": 1.5525178909301758, + "learning_rate": 9.586994727592267e-05, + "loss": 0.856796383857727, + "step": 1092 + }, + { + "epoch": 0.46160337552742614, + "grad_norm": 1.2944618463516235, + "learning_rate": 9.604569420035149e-05, + "loss": 0.7626663446426392, + "step": 1094 + }, + { + "epoch": 0.46244725738396625, + "grad_norm": 1.412204623222351, + "learning_rate": 9.622144112478033e-05, + "loss": 0.7524681091308594, + "step": 1096 + }, + { + "epoch": 0.46329113924050636, + "grad_norm": 1.4851596355438232, + "learning_rate": 9.639718804920914e-05, + "loss": 0.8430375456809998, + "step": 1098 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 1.831943154335022, + "learning_rate": 9.657293497363796e-05, + "loss": 0.8374918103218079, + "step": 1100 + }, + { + "epoch": 0.4641350210970464, + "eval_loss": 0.8283821940422058, + "eval_runtime": 861.0464, + "eval_samples_per_second": 2.447, + "eval_steps_per_second": 2.447, + "step": 1100 + }, + { + "epoch": 0.4649789029535865, + "grad_norm": 1.4989945888519287, + "learning_rate": 9.674868189806678e-05, + "loss": 0.8063139915466309, + "step": 1102 + }, + { + "epoch": 0.46582278481012657, + "grad_norm": 1.3772722482681274, + "learning_rate": 9.692442882249562e-05, + "loss": 0.8109207153320312, + "step": 1104 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.4963124990463257, + "learning_rate": 9.710017574692443e-05, + "loss": 0.8667853474617004, + "step": 1106 + }, + { + "epoch": 0.4675105485232067, + "grad_norm": 1.4250836372375488, + "learning_rate": 9.727592267135325e-05, + "loss": 0.8020523190498352, + "step": 1108 + }, + { + "epoch": 0.46835443037974683, + "grad_norm": 1.475599765777588, + "learning_rate": 9.745166959578209e-05, + "loss": 0.8271048069000244, + "step": 1110 + }, + { + "epoch": 0.46919831223628694, + "grad_norm": 1.3727436065673828, + "learning_rate": 9.76274165202109e-05, + "loss": 0.7615619897842407, + "step": 1112 + }, + { + "epoch": 0.470042194092827, + "grad_norm": 1.2233914136886597, + "learning_rate": 9.780316344463972e-05, + "loss": 0.7843242883682251, + "step": 1114 + }, + { + "epoch": 0.4708860759493671, + "grad_norm": 1.5734832286834717, + "learning_rate": 9.797891036906854e-05, + "loss": 0.834839940071106, + "step": 1116 + }, + { + "epoch": 0.47172995780590715, + "grad_norm": 1.3778531551361084, + "learning_rate": 9.815465729349736e-05, + "loss": 0.7584373950958252, + "step": 1118 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 1.5535035133361816, + "learning_rate": 9.833040421792618e-05, + "loss": 0.8204697370529175, + "step": 1120 + }, + { + "epoch": 0.47341772151898737, + "grad_norm": 1.4743636846542358, + "learning_rate": 9.850615114235501e-05, + "loss": 0.9012852311134338, + "step": 1122 + }, + { + "epoch": 0.4742616033755274, + "grad_norm": 1.4134864807128906, + "learning_rate": 9.868189806678383e-05, + "loss": 0.8392805457115173, + "step": 1124 + }, + { + "epoch": 0.4751054852320675, + "grad_norm": 1.3308019638061523, + "learning_rate": 9.885764499121267e-05, + "loss": 0.7135441303253174, + "step": 1126 + }, + { + "epoch": 0.4759493670886076, + "grad_norm": 1.5354844331741333, + "learning_rate": 9.903339191564149e-05, + "loss": 0.8464727401733398, + "step": 1128 + }, + { + "epoch": 0.4767932489451477, + "grad_norm": 1.2730523347854614, + "learning_rate": 9.92091388400703e-05, + "loss": 0.7691597938537598, + "step": 1130 + }, + { + "epoch": 0.47763713080168774, + "grad_norm": 1.5459758043289185, + "learning_rate": 9.938488576449912e-05, + "loss": 0.8068788647651672, + "step": 1132 + }, + { + "epoch": 0.47848101265822784, + "grad_norm": 1.345678687095642, + "learning_rate": 9.956063268892794e-05, + "loss": 0.8091006278991699, + "step": 1134 + }, + { + "epoch": 0.47932489451476795, + "grad_norm": 1.317076563835144, + "learning_rate": 9.973637961335676e-05, + "loss": 0.735533595085144, + "step": 1136 + }, + { + "epoch": 0.480168776371308, + "grad_norm": 1.5011168718338013, + "learning_rate": 9.99121265377856e-05, + "loss": 0.7935182452201843, + "step": 1138 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.673899531364441, + "learning_rate": 9.999999855824502e-05, + "loss": 0.8203520774841309, + "step": 1140 + }, + { + "epoch": 0.48185654008438816, + "grad_norm": 1.344337821006775, + "learning_rate": 9.999998702420562e-05, + "loss": 0.7233241200447083, + "step": 1142 + }, + { + "epoch": 0.48270042194092827, + "grad_norm": 1.5819076299667358, + "learning_rate": 9.999996395612948e-05, + "loss": 0.8795552849769592, + "step": 1144 + }, + { + "epoch": 0.4835443037974684, + "grad_norm": 1.7427241802215576, + "learning_rate": 9.999992935402192e-05, + "loss": 0.8482733964920044, + "step": 1146 + }, + { + "epoch": 0.48438818565400843, + "grad_norm": 1.2877503633499146, + "learning_rate": 9.999988321789093e-05, + "loss": 0.7905706167221069, + "step": 1148 + }, + { + "epoch": 0.48523206751054854, + "grad_norm": 1.4887222051620483, + "learning_rate": 9.999982554774715e-05, + "loss": 0.8609708547592163, + "step": 1150 + }, + { + "epoch": 0.4860759493670886, + "grad_norm": 1.3625136613845825, + "learning_rate": 9.999975634360388e-05, + "loss": 0.7890065908432007, + "step": 1152 + }, + { + "epoch": 0.4869198312236287, + "grad_norm": 1.3631492853164673, + "learning_rate": 9.999967560547708e-05, + "loss": 0.7908958196640015, + "step": 1154 + }, + { + "epoch": 0.4877637130801688, + "grad_norm": 1.5244156122207642, + "learning_rate": 9.99995833333854e-05, + "loss": 0.8509655594825745, + "step": 1156 + }, + { + "epoch": 0.48860759493670886, + "grad_norm": 1.2513200044631958, + "learning_rate": 9.999947952735007e-05, + "loss": 0.7329106330871582, + "step": 1158 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 1.1539413928985596, + "learning_rate": 9.99993641873951e-05, + "loss": 0.7237489223480225, + "step": 1160 + }, + { + "epoch": 0.490295358649789, + "grad_norm": 1.3859314918518066, + "learning_rate": 9.999923731354706e-05, + "loss": 0.8650591373443604, + "step": 1162 + }, + { + "epoch": 0.4911392405063291, + "grad_norm": 1.2910805940628052, + "learning_rate": 9.999909890583521e-05, + "loss": 0.7516807913780212, + "step": 1164 + }, + { + "epoch": 0.4919831223628692, + "grad_norm": 1.6100077629089355, + "learning_rate": 9.999894896429152e-05, + "loss": 0.7082475423812866, + "step": 1166 + }, + { + "epoch": 0.4928270042194093, + "grad_norm": 1.2313556671142578, + "learning_rate": 9.999878748895053e-05, + "loss": 0.8403750658035278, + "step": 1168 + }, + { + "epoch": 0.4936708860759494, + "grad_norm": 1.3402830362319946, + "learning_rate": 9.999861447984952e-05, + "loss": 0.8083041906356812, + "step": 1170 + }, + { + "epoch": 0.49451476793248944, + "grad_norm": 1.516775131225586, + "learning_rate": 9.999842993702839e-05, + "loss": 0.8339354991912842, + "step": 1172 + }, + { + "epoch": 0.49535864978902955, + "grad_norm": 1.2698423862457275, + "learning_rate": 9.999823386052971e-05, + "loss": 0.7708724141120911, + "step": 1174 + }, + { + "epoch": 0.4962025316455696, + "grad_norm": 1.339390516281128, + "learning_rate": 9.999802625039872e-05, + "loss": 0.7589715719223022, + "step": 1176 + }, + { + "epoch": 0.4970464135021097, + "grad_norm": 1.4618452787399292, + "learning_rate": 9.99978071066833e-05, + "loss": 0.8523206114768982, + "step": 1178 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 1.4812564849853516, + "learning_rate": 9.9997576429434e-05, + "loss": 0.8143196105957031, + "step": 1180 + }, + { + "epoch": 0.49873417721518987, + "grad_norm": 1.5720716714859009, + "learning_rate": 9.999733421870405e-05, + "loss": 0.800125002861023, + "step": 1182 + }, + { + "epoch": 0.49957805907173, + "grad_norm": 1.4421230554580688, + "learning_rate": 9.99970804745493e-05, + "loss": 0.7618259191513062, + "step": 1184 + }, + { + "epoch": 0.5004219409282701, + "grad_norm": 1.5794934034347534, + "learning_rate": 9.99968151970283e-05, + "loss": 0.7162163853645325, + "step": 1186 + }, + { + "epoch": 0.5012658227848101, + "grad_norm": 1.8590432405471802, + "learning_rate": 9.999653838620225e-05, + "loss": 0.8089820146560669, + "step": 1188 + }, + { + "epoch": 0.5021097046413502, + "grad_norm": 1.5194507837295532, + "learning_rate": 9.999625004213498e-05, + "loss": 0.8011203408241272, + "step": 1190 + }, + { + "epoch": 0.5029535864978903, + "grad_norm": 1.6986470222473145, + "learning_rate": 9.999595016489303e-05, + "loss": 0.761158287525177, + "step": 1192 + }, + { + "epoch": 0.5037974683544304, + "grad_norm": 1.4413946866989136, + "learning_rate": 9.999563875454559e-05, + "loss": 0.7898027300834656, + "step": 1194 + }, + { + "epoch": 0.5046413502109705, + "grad_norm": 1.4509994983673096, + "learning_rate": 9.999531581116443e-05, + "loss": 0.8018442392349243, + "step": 1196 + }, + { + "epoch": 0.5054852320675105, + "grad_norm": 1.400659441947937, + "learning_rate": 9.999498133482412e-05, + "loss": 0.7804076075553894, + "step": 1198 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.486840009689331, + "learning_rate": 9.999463532560178e-05, + "loss": 0.82496178150177, + "step": 1200 + }, + { + "epoch": 0.5063291139240507, + "eval_loss": 0.8186545968055725, + "eval_runtime": 862.1638, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 1200 + }, + { + "epoch": 0.5071729957805907, + "grad_norm": 1.2770357131958008, + "learning_rate": 9.999427778357723e-05, + "loss": 0.8037722706794739, + "step": 1202 + }, + { + "epoch": 0.5080168776371308, + "grad_norm": 1.4540977478027344, + "learning_rate": 9.999390870883297e-05, + "loss": 0.7329373359680176, + "step": 1204 + }, + { + "epoch": 0.5088607594936709, + "grad_norm": 1.4469913244247437, + "learning_rate": 9.999352810145412e-05, + "loss": 0.8224589824676514, + "step": 1206 + }, + { + "epoch": 0.509704641350211, + "grad_norm": 1.46500563621521, + "learning_rate": 9.999313596152847e-05, + "loss": 0.8106292486190796, + "step": 1208 + }, + { + "epoch": 0.510548523206751, + "grad_norm": 1.3526637554168701, + "learning_rate": 9.999273228914649e-05, + "loss": 0.747698187828064, + "step": 1210 + }, + { + "epoch": 0.5113924050632911, + "grad_norm": 1.28840172290802, + "learning_rate": 9.999231708440131e-05, + "loss": 0.7612425684928894, + "step": 1212 + }, + { + "epoch": 0.5122362869198313, + "grad_norm": 1.0283230543136597, + "learning_rate": 9.99918903473887e-05, + "loss": 0.6839463710784912, + "step": 1214 + }, + { + "epoch": 0.5130801687763713, + "grad_norm": 1.5231431722640991, + "learning_rate": 9.999145207820708e-05, + "loss": 0.8539203405380249, + "step": 1216 + }, + { + "epoch": 0.5139240506329114, + "grad_norm": 1.3289231061935425, + "learning_rate": 9.999100227695758e-05, + "loss": 0.7960102558135986, + "step": 1218 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 1.3770930767059326, + "learning_rate": 9.999054094374396e-05, + "loss": 0.7639255523681641, + "step": 1220 + }, + { + "epoch": 0.5156118143459916, + "grad_norm": 1.3028030395507812, + "learning_rate": 9.999006807867262e-05, + "loss": 0.7743061780929565, + "step": 1222 + }, + { + "epoch": 0.5164556962025316, + "grad_norm": 1.1827034950256348, + "learning_rate": 9.998958368185265e-05, + "loss": 0.7922407984733582, + "step": 1224 + }, + { + "epoch": 0.5172995780590718, + "grad_norm": 1.2973705530166626, + "learning_rate": 9.99890877533958e-05, + "loss": 0.7671286463737488, + "step": 1226 + }, + { + "epoch": 0.5181434599156118, + "grad_norm": 1.5820153951644897, + "learning_rate": 9.998858029341646e-05, + "loss": 0.7546951174736023, + "step": 1228 + }, + { + "epoch": 0.5189873417721519, + "grad_norm": 1.6140317916870117, + "learning_rate": 9.99880613020317e-05, + "loss": 0.8734183311462402, + "step": 1230 + }, + { + "epoch": 0.5198312236286919, + "grad_norm": 1.1190184354782104, + "learning_rate": 9.998753077936122e-05, + "loss": 0.8410643339157104, + "step": 1232 + }, + { + "epoch": 0.5206751054852321, + "grad_norm": 1.3876196146011353, + "learning_rate": 9.998698872552744e-05, + "loss": 0.7769841551780701, + "step": 1234 + }, + { + "epoch": 0.5215189873417722, + "grad_norm": 1.699522852897644, + "learning_rate": 9.998643514065535e-05, + "loss": 0.8846109509468079, + "step": 1236 + }, + { + "epoch": 0.5223628691983122, + "grad_norm": 1.3805134296417236, + "learning_rate": 9.998587002487271e-05, + "loss": 0.7664945125579834, + "step": 1238 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 1.3679476976394653, + "learning_rate": 9.998529337830984e-05, + "loss": 0.7243514060974121, + "step": 1240 + }, + { + "epoch": 0.5240506329113924, + "grad_norm": 1.399200677871704, + "learning_rate": 9.998470520109977e-05, + "loss": 0.8061941862106323, + "step": 1242 + }, + { + "epoch": 0.5248945147679325, + "grad_norm": 1.3441044092178345, + "learning_rate": 9.99841054933782e-05, + "loss": 0.7741840481758118, + "step": 1244 + }, + { + "epoch": 0.5257383966244725, + "grad_norm": 1.3375325202941895, + "learning_rate": 9.998349425528344e-05, + "loss": 0.7619491815567017, + "step": 1246 + }, + { + "epoch": 0.5265822784810127, + "grad_norm": 1.5517847537994385, + "learning_rate": 9.998287148695651e-05, + "loss": 0.8315094113349915, + "step": 1248 + }, + { + "epoch": 0.5274261603375527, + "grad_norm": 1.244997501373291, + "learning_rate": 9.998223718854107e-05, + "loss": 0.7536082863807678, + "step": 1250 + }, + { + "epoch": 0.5282700421940928, + "grad_norm": 1.3190033435821533, + "learning_rate": 9.998159136018344e-05, + "loss": 0.826419472694397, + "step": 1252 + }, + { + "epoch": 0.529113924050633, + "grad_norm": 1.2750061750411987, + "learning_rate": 9.998093400203259e-05, + "loss": 0.7866435647010803, + "step": 1254 + }, + { + "epoch": 0.529957805907173, + "grad_norm": 1.422908067703247, + "learning_rate": 9.998026511424017e-05, + "loss": 0.7796626687049866, + "step": 1256 + }, + { + "epoch": 0.5308016877637131, + "grad_norm": 1.435552954673767, + "learning_rate": 9.997958469696048e-05, + "loss": 0.815027117729187, + "step": 1258 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 1.1950994729995728, + "learning_rate": 9.997889275035049e-05, + "loss": 0.6925795674324036, + "step": 1260 + }, + { + "epoch": 0.5324894514767933, + "grad_norm": 1.3049622774124146, + "learning_rate": 9.997818927456978e-05, + "loss": 0.822464108467102, + "step": 1262 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.2197340726852417, + "learning_rate": 9.997747426978066e-05, + "loss": 0.7955381274223328, + "step": 1264 + }, + { + "epoch": 0.5341772151898734, + "grad_norm": 1.2463661432266235, + "learning_rate": 9.997674773614807e-05, + "loss": 0.8642181754112244, + "step": 1266 + }, + { + "epoch": 0.5350210970464135, + "grad_norm": 1.421393871307373, + "learning_rate": 9.99760096738396e-05, + "loss": 0.8776891827583313, + "step": 1268 + }, + { + "epoch": 0.5358649789029536, + "grad_norm": 1.4347561597824097, + "learning_rate": 9.997526008302549e-05, + "loss": 0.7446491122245789, + "step": 1270 + }, + { + "epoch": 0.5367088607594936, + "grad_norm": 1.2056710720062256, + "learning_rate": 9.99744989638787e-05, + "loss": 0.8581281304359436, + "step": 1272 + }, + { + "epoch": 0.5375527426160338, + "grad_norm": 1.1672608852386475, + "learning_rate": 9.997372631657475e-05, + "loss": 0.7386330366134644, + "step": 1274 + }, + { + "epoch": 0.5383966244725739, + "grad_norm": 1.4313966035842896, + "learning_rate": 9.997294214129191e-05, + "loss": 0.7806804776191711, + "step": 1276 + }, + { + "epoch": 0.5392405063291139, + "grad_norm": 1.1666971445083618, + "learning_rate": 9.997214643821107e-05, + "loss": 0.6830351948738098, + "step": 1278 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 1.491783857345581, + "learning_rate": 9.997133920751578e-05, + "loss": 0.8570694327354431, + "step": 1280 + }, + { + "epoch": 0.5409282700421941, + "grad_norm": 1.1879212856292725, + "learning_rate": 9.997052044939226e-05, + "loss": 0.7016772031784058, + "step": 1282 + }, + { + "epoch": 0.5417721518987342, + "grad_norm": 1.2692012786865234, + "learning_rate": 9.996969016402935e-05, + "loss": 0.7711107134819031, + "step": 1284 + }, + { + "epoch": 0.5426160337552742, + "grad_norm": 1.3318448066711426, + "learning_rate": 9.996884835161863e-05, + "loss": 0.7807164788246155, + "step": 1286 + }, + { + "epoch": 0.5434599156118144, + "grad_norm": 1.1786744594573975, + "learning_rate": 9.996799501235425e-05, + "loss": 0.7331319451332092, + "step": 1288 + }, + { + "epoch": 0.5443037974683544, + "grad_norm": 1.4092369079589844, + "learning_rate": 9.996713014643309e-05, + "loss": 0.7191547155380249, + "step": 1290 + }, + { + "epoch": 0.5451476793248945, + "grad_norm": 1.377099633216858, + "learning_rate": 9.996625375405463e-05, + "loss": 0.7233871221542358, + "step": 1292 + }, + { + "epoch": 0.5459915611814345, + "grad_norm": 1.404945969581604, + "learning_rate": 9.996536583542105e-05, + "loss": 0.7925472855567932, + "step": 1294 + }, + { + "epoch": 0.5468354430379747, + "grad_norm": 1.2555286884307861, + "learning_rate": 9.996446639073718e-05, + "loss": 0.7749786376953125, + "step": 1296 + }, + { + "epoch": 0.5476793248945148, + "grad_norm": 1.2577459812164307, + "learning_rate": 9.996355542021048e-05, + "loss": 0.7647517919540405, + "step": 1298 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 1.3587758541107178, + "learning_rate": 9.996263292405113e-05, + "loss": 0.8621891140937805, + "step": 1300 + }, + { + "epoch": 0.5485232067510548, + "eval_loss": 0.808323085308075, + "eval_runtime": 853.577, + "eval_samples_per_second": 2.468, + "eval_steps_per_second": 2.468, + "step": 1300 + }, + { + "epoch": 0.549367088607595, + "grad_norm": 1.327125906944275, + "learning_rate": 9.996169890247191e-05, + "loss": 0.749254584312439, + "step": 1302 + }, + { + "epoch": 0.550210970464135, + "grad_norm": 1.4620670080184937, + "learning_rate": 9.99607533556883e-05, + "loss": 0.7362856268882751, + "step": 1304 + }, + { + "epoch": 0.5510548523206751, + "grad_norm": 1.4119454622268677, + "learning_rate": 9.99597962839184e-05, + "loss": 0.7918445467948914, + "step": 1306 + }, + { + "epoch": 0.5518987341772152, + "grad_norm": 1.497522234916687, + "learning_rate": 9.995882768738298e-05, + "loss": 0.7348005175590515, + "step": 1308 + }, + { + "epoch": 0.5527426160337553, + "grad_norm": 1.535741925239563, + "learning_rate": 9.99578475663055e-05, + "loss": 0.8310725688934326, + "step": 1310 + }, + { + "epoch": 0.5535864978902953, + "grad_norm": 1.4606215953826904, + "learning_rate": 9.995685592091204e-05, + "loss": 0.8232766389846802, + "step": 1312 + }, + { + "epoch": 0.5544303797468354, + "grad_norm": 1.2442357540130615, + "learning_rate": 9.995585275143136e-05, + "loss": 0.8273071050643921, + "step": 1314 + }, + { + "epoch": 0.5552742616033756, + "grad_norm": 1.5128520727157593, + "learning_rate": 9.995483805809487e-05, + "loss": 0.7518656253814697, + "step": 1316 + }, + { + "epoch": 0.5561181434599156, + "grad_norm": 1.340149998664856, + "learning_rate": 9.995381184113664e-05, + "loss": 0.8261662721633911, + "step": 1318 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 1.1409451961517334, + "learning_rate": 9.99527741007934e-05, + "loss": 0.5775256156921387, + "step": 1320 + }, + { + "epoch": 0.5578059071729958, + "grad_norm": 1.3489247560501099, + "learning_rate": 9.995172483730455e-05, + "loss": 0.7698423862457275, + "step": 1322 + }, + { + "epoch": 0.5586497890295359, + "grad_norm": 1.4950530529022217, + "learning_rate": 9.995066405091211e-05, + "loss": 0.8053334355354309, + "step": 1324 + }, + { + "epoch": 0.5594936708860759, + "grad_norm": 1.3814653158187866, + "learning_rate": 9.994959174186078e-05, + "loss": 0.7826266288757324, + "step": 1326 + }, + { + "epoch": 0.560337552742616, + "grad_norm": 1.3383625745773315, + "learning_rate": 9.994850791039796e-05, + "loss": 0.7862131595611572, + "step": 1328 + }, + { + "epoch": 0.5611814345991561, + "grad_norm": 1.3529670238494873, + "learning_rate": 9.994741255677363e-05, + "loss": 0.8428501486778259, + "step": 1330 + }, + { + "epoch": 0.5620253164556962, + "grad_norm": 1.254215121269226, + "learning_rate": 9.994630568124049e-05, + "loss": 0.7340869307518005, + "step": 1332 + }, + { + "epoch": 0.5628691983122363, + "grad_norm": 1.2869828939437866, + "learning_rate": 9.994518728405386e-05, + "loss": 0.7052226662635803, + "step": 1334 + }, + { + "epoch": 0.5637130801687764, + "grad_norm": 1.4321808815002441, + "learning_rate": 9.994405736547174e-05, + "loss": 0.8297074437141418, + "step": 1336 + }, + { + "epoch": 0.5645569620253165, + "grad_norm": 1.4638891220092773, + "learning_rate": 9.994291592575478e-05, + "loss": 0.7183220982551575, + "step": 1338 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 1.4947413206100464, + "learning_rate": 9.994176296516628e-05, + "loss": 0.8146093487739563, + "step": 1340 + }, + { + "epoch": 0.5662447257383966, + "grad_norm": 1.343862533569336, + "learning_rate": 9.994059848397221e-05, + "loss": 0.7583593130111694, + "step": 1342 + }, + { + "epoch": 0.5670886075949367, + "grad_norm": 1.203550100326538, + "learning_rate": 9.993942248244121e-05, + "loss": 0.7682924270629883, + "step": 1344 + }, + { + "epoch": 0.5679324894514768, + "grad_norm": 1.287660002708435, + "learning_rate": 9.993823496084455e-05, + "loss": 0.8139828443527222, + "step": 1346 + }, + { + "epoch": 0.5687763713080168, + "grad_norm": 1.3326014280319214, + "learning_rate": 9.993703591945616e-05, + "loss": 0.7529099583625793, + "step": 1348 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.2441487312316895, + "learning_rate": 9.993582535855263e-05, + "loss": 0.6997471451759338, + "step": 1350 + }, + { + "epoch": 0.570464135021097, + "grad_norm": 1.2647649049758911, + "learning_rate": 9.993460327841325e-05, + "loss": 0.7421218752861023, + "step": 1352 + }, + { + "epoch": 0.5713080168776371, + "grad_norm": 1.146399974822998, + "learning_rate": 9.99333696793199e-05, + "loss": 0.7342398166656494, + "step": 1354 + }, + { + "epoch": 0.5721518987341773, + "grad_norm": 1.3346691131591797, + "learning_rate": 9.993212456155715e-05, + "loss": 0.7175891399383545, + "step": 1356 + }, + { + "epoch": 0.5729957805907173, + "grad_norm": 1.3950672149658203, + "learning_rate": 9.993086792541222e-05, + "loss": 0.8108891248703003, + "step": 1358 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 1.339931845664978, + "learning_rate": 9.992959977117502e-05, + "loss": 0.6979889273643494, + "step": 1360 + }, + { + "epoch": 0.5746835443037974, + "grad_norm": 1.3276840448379517, + "learning_rate": 9.992832009913806e-05, + "loss": 0.7635799050331116, + "step": 1362 + }, + { + "epoch": 0.5755274261603376, + "grad_norm": 1.5015610456466675, + "learning_rate": 9.992702890959653e-05, + "loss": 0.7575043439865112, + "step": 1364 + }, + { + "epoch": 0.5763713080168776, + "grad_norm": 1.4755414724349976, + "learning_rate": 9.99257262028483e-05, + "loss": 0.8134847283363342, + "step": 1366 + }, + { + "epoch": 0.5772151898734177, + "grad_norm": 1.3788783550262451, + "learning_rate": 9.992441197919388e-05, + "loss": 0.7663828134536743, + "step": 1368 + }, + { + "epoch": 0.5780590717299579, + "grad_norm": 1.2814711332321167, + "learning_rate": 9.992308623893644e-05, + "loss": 0.6711251735687256, + "step": 1370 + }, + { + "epoch": 0.5789029535864979, + "grad_norm": 1.5343635082244873, + "learning_rate": 9.99217489823818e-05, + "loss": 0.8097200393676758, + "step": 1372 + }, + { + "epoch": 0.579746835443038, + "grad_norm": 1.3029557466506958, + "learning_rate": 9.992040020983843e-05, + "loss": 0.8274240493774414, + "step": 1374 + }, + { + "epoch": 0.580590717299578, + "grad_norm": 1.4034144878387451, + "learning_rate": 9.991903992161746e-05, + "loss": 0.7758964896202087, + "step": 1376 + }, + { + "epoch": 0.5814345991561182, + "grad_norm": 1.2340021133422852, + "learning_rate": 9.991766811803271e-05, + "loss": 0.6571930050849915, + "step": 1378 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 1.3082842826843262, + "learning_rate": 9.991628479940061e-05, + "loss": 0.7381542921066284, + "step": 1380 + }, + { + "epoch": 0.5831223628691983, + "grad_norm": 1.8134801387786865, + "learning_rate": 9.991488996604025e-05, + "loss": 0.8081237077713013, + "step": 1382 + }, + { + "epoch": 0.5839662447257384, + "grad_norm": 1.4598309993743896, + "learning_rate": 9.991348361827343e-05, + "loss": 0.7761610746383667, + "step": 1384 + }, + { + "epoch": 0.5848101265822785, + "grad_norm": 1.2974225282669067, + "learning_rate": 9.991206575642453e-05, + "loss": 0.6872953176498413, + "step": 1386 + }, + { + "epoch": 0.5856540084388185, + "grad_norm": 1.24009370803833, + "learning_rate": 9.991063638082065e-05, + "loss": 0.7601345777511597, + "step": 1388 + }, + { + "epoch": 0.5864978902953587, + "grad_norm": 1.176713228225708, + "learning_rate": 9.99091954917915e-05, + "loss": 0.7138593792915344, + "step": 1390 + }, + { + "epoch": 0.5873417721518988, + "grad_norm": 1.1056525707244873, + "learning_rate": 9.990774308966949e-05, + "loss": 0.7730305194854736, + "step": 1392 + }, + { + "epoch": 0.5881856540084388, + "grad_norm": 1.382847547531128, + "learning_rate": 9.990627917478962e-05, + "loss": 0.7076689600944519, + "step": 1394 + }, + { + "epoch": 0.5890295358649789, + "grad_norm": 1.2507930994033813, + "learning_rate": 9.990480374748964e-05, + "loss": 0.7970513105392456, + "step": 1396 + }, + { + "epoch": 0.589873417721519, + "grad_norm": 1.2266724109649658, + "learning_rate": 9.990331680810987e-05, + "loss": 0.7906717658042908, + "step": 1398 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 1.299920916557312, + "learning_rate": 9.99018183569933e-05, + "loss": 0.853204607963562, + "step": 1400 + }, + { + "epoch": 0.5907172995780591, + "eval_loss": 0.8009664416313171, + "eval_runtime": 851.9417, + "eval_samples_per_second": 2.473, + "eval_steps_per_second": 2.473, + "step": 1400 + }, + { + "epoch": 0.5915611814345991, + "grad_norm": 1.2114863395690918, + "learning_rate": 9.990030839448564e-05, + "loss": 0.8140703439712524, + "step": 1402 + }, + { + "epoch": 0.5924050632911393, + "grad_norm": 1.3301794528961182, + "learning_rate": 9.989878692093518e-05, + "loss": 0.7471320629119873, + "step": 1404 + }, + { + "epoch": 0.5932489451476793, + "grad_norm": 1.2611899375915527, + "learning_rate": 9.98972539366929e-05, + "loss": 0.7307024002075195, + "step": 1406 + }, + { + "epoch": 0.5940928270042194, + "grad_norm": 1.1717802286148071, + "learning_rate": 9.989570944211244e-05, + "loss": 0.6843112111091614, + "step": 1408 + }, + { + "epoch": 0.5949367088607594, + "grad_norm": 1.3323513269424438, + "learning_rate": 9.989415343755006e-05, + "loss": 0.7025372385978699, + "step": 1410 + }, + { + "epoch": 0.5957805907172996, + "grad_norm": 1.4225109815597534, + "learning_rate": 9.989258592336473e-05, + "loss": 0.7792683839797974, + "step": 1412 + }, + { + "epoch": 0.5966244725738397, + "grad_norm": 1.2878522872924805, + "learning_rate": 9.989100689991804e-05, + "loss": 0.8328315019607544, + "step": 1414 + }, + { + "epoch": 0.5974683544303797, + "grad_norm": 1.2067214250564575, + "learning_rate": 9.988941636757421e-05, + "loss": 0.7700617909431458, + "step": 1416 + }, + { + "epoch": 0.5983122362869199, + "grad_norm": 1.1213195323944092, + "learning_rate": 9.988781432670019e-05, + "loss": 0.6872363090515137, + "step": 1418 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 1.3211694955825806, + "learning_rate": 9.98862007776655e-05, + "loss": 0.7184111475944519, + "step": 1420 + }, + { + "epoch": 0.6, + "grad_norm": 1.1916998624801636, + "learning_rate": 9.98845757208424e-05, + "loss": 0.8120859265327454, + "step": 1422 + }, + { + "epoch": 0.60084388185654, + "grad_norm": 1.2772804498672485, + "learning_rate": 9.988293915660572e-05, + "loss": 0.7586462497711182, + "step": 1424 + }, + { + "epoch": 0.6016877637130802, + "grad_norm": 1.4139106273651123, + "learning_rate": 9.988129108533299e-05, + "loss": 0.8175994157791138, + "step": 1426 + }, + { + "epoch": 0.6025316455696202, + "grad_norm": 1.4481157064437866, + "learning_rate": 9.987963150740439e-05, + "loss": 0.7662636041641235, + "step": 1428 + }, + { + "epoch": 0.6033755274261603, + "grad_norm": 1.6000999212265015, + "learning_rate": 9.987796042320277e-05, + "loss": 0.7477837800979614, + "step": 1430 + }, + { + "epoch": 0.6042194092827005, + "grad_norm": 1.26194429397583, + "learning_rate": 9.98762778331136e-05, + "loss": 0.7392798662185669, + "step": 1432 + }, + { + "epoch": 0.6050632911392405, + "grad_norm": 1.2370645999908447, + "learning_rate": 9.987458373752503e-05, + "loss": 0.7795998454093933, + "step": 1434 + }, + { + "epoch": 0.6059071729957806, + "grad_norm": 1.4908311367034912, + "learning_rate": 9.987287813682784e-05, + "loss": 0.7833777070045471, + "step": 1436 + }, + { + "epoch": 0.6067510548523207, + "grad_norm": 1.2918652296066284, + "learning_rate": 9.987116103141549e-05, + "loss": 0.7269768118858337, + "step": 1438 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 1.2170461416244507, + "learning_rate": 9.98694324216841e-05, + "loss": 0.7599279284477234, + "step": 1440 + }, + { + "epoch": 0.6084388185654008, + "grad_norm": 1.4373505115509033, + "learning_rate": 9.98676923080324e-05, + "loss": 0.8256514668464661, + "step": 1442 + }, + { + "epoch": 0.6092827004219409, + "grad_norm": 1.3523614406585693, + "learning_rate": 9.986594069086181e-05, + "loss": 0.8462428450584412, + "step": 1444 + }, + { + "epoch": 0.610126582278481, + "grad_norm": 1.5131851434707642, + "learning_rate": 9.98641775705764e-05, + "loss": 0.8402239084243774, + "step": 1446 + }, + { + "epoch": 0.6109704641350211, + "grad_norm": 1.3518229722976685, + "learning_rate": 9.98624029475829e-05, + "loss": 0.7585759162902832, + "step": 1448 + }, + { + "epoch": 0.6118143459915611, + "grad_norm": 1.3403998613357544, + "learning_rate": 9.986061682229064e-05, + "loss": 0.773881733417511, + "step": 1450 + }, + { + "epoch": 0.6126582278481013, + "grad_norm": 1.1835366487503052, + "learning_rate": 9.985881919511168e-05, + "loss": 0.6770316958427429, + "step": 1452 + }, + { + "epoch": 0.6135021097046414, + "grad_norm": 1.1825730800628662, + "learning_rate": 9.985701006646069e-05, + "loss": 0.7081645727157593, + "step": 1454 + }, + { + "epoch": 0.6143459915611814, + "grad_norm": 1.378994345664978, + "learning_rate": 9.9855189436755e-05, + "loss": 0.7750917673110962, + "step": 1456 + }, + { + "epoch": 0.6151898734177215, + "grad_norm": 1.4208749532699585, + "learning_rate": 9.985335730641458e-05, + "loss": 0.7517801523208618, + "step": 1458 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 1.1413639783859253, + "learning_rate": 9.98515136758621e-05, + "loss": 0.712832510471344, + "step": 1460 + }, + { + "epoch": 0.6168776371308017, + "grad_norm": 1.3949562311172485, + "learning_rate": 9.984965854552283e-05, + "loss": 0.7884142994880676, + "step": 1462 + }, + { + "epoch": 0.6177215189873417, + "grad_norm": 1.4057096242904663, + "learning_rate": 9.984779191582471e-05, + "loss": 0.796623706817627, + "step": 1464 + }, + { + "epoch": 0.6185654008438819, + "grad_norm": 1.1681689023971558, + "learning_rate": 9.984591378719834e-05, + "loss": 0.7862933874130249, + "step": 1466 + }, + { + "epoch": 0.619409282700422, + "grad_norm": 1.2585291862487793, + "learning_rate": 9.984402416007696e-05, + "loss": 0.7889828681945801, + "step": 1468 + }, + { + "epoch": 0.620253164556962, + "grad_norm": 1.2598098516464233, + "learning_rate": 9.984212303489649e-05, + "loss": 0.7375997304916382, + "step": 1470 + }, + { + "epoch": 0.6210970464135022, + "grad_norm": 1.4628467559814453, + "learning_rate": 9.984021041209547e-05, + "loss": 0.7839564085006714, + "step": 1472 + }, + { + "epoch": 0.6219409282700422, + "grad_norm": 1.3606770038604736, + "learning_rate": 9.983828629211511e-05, + "loss": 0.7566051483154297, + "step": 1474 + }, + { + "epoch": 0.6227848101265823, + "grad_norm": 1.182644248008728, + "learning_rate": 9.983635067539927e-05, + "loss": 0.6638457179069519, + "step": 1476 + }, + { + "epoch": 0.6236286919831223, + "grad_norm": 1.5617793798446655, + "learning_rate": 9.983440356239445e-05, + "loss": 0.8227225542068481, + "step": 1478 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 1.2290058135986328, + "learning_rate": 9.98324449535498e-05, + "loss": 0.7086431980133057, + "step": 1480 + }, + { + "epoch": 0.6253164556962025, + "grad_norm": 1.3822678327560425, + "learning_rate": 9.983047484931716e-05, + "loss": 0.8076596856117249, + "step": 1482 + }, + { + "epoch": 0.6261603375527426, + "grad_norm": 1.163699746131897, + "learning_rate": 9.982849325015098e-05, + "loss": 0.7514539361000061, + "step": 1484 + }, + { + "epoch": 0.6270042194092827, + "grad_norm": 1.2635631561279297, + "learning_rate": 9.982650015650839e-05, + "loss": 0.7298142910003662, + "step": 1486 + }, + { + "epoch": 0.6278481012658228, + "grad_norm": 1.3135387897491455, + "learning_rate": 9.982449556884914e-05, + "loss": 0.8092831373214722, + "step": 1488 + }, + { + "epoch": 0.6286919831223629, + "grad_norm": 1.3577877283096313, + "learning_rate": 9.982247948763567e-05, + "loss": 0.7934147715568542, + "step": 1490 + }, + { + "epoch": 0.6295358649789029, + "grad_norm": 1.1482092142105103, + "learning_rate": 9.982045191333304e-05, + "loss": 0.789363443851471, + "step": 1492 + }, + { + "epoch": 0.6303797468354431, + "grad_norm": 1.189771056175232, + "learning_rate": 9.981841284640895e-05, + "loss": 0.7458413243293762, + "step": 1494 + }, + { + "epoch": 0.6312236286919831, + "grad_norm": 1.2815836668014526, + "learning_rate": 9.981636228733383e-05, + "loss": 0.7299918532371521, + "step": 1496 + }, + { + "epoch": 0.6320675105485232, + "grad_norm": 1.36761474609375, + "learning_rate": 9.981430023658068e-05, + "loss": 0.7545169591903687, + "step": 1498 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.2594345808029175, + "learning_rate": 9.981222669462513e-05, + "loss": 0.7358481884002686, + "step": 1500 + }, + { + "epoch": 0.6329113924050633, + "eval_loss": 0.7896141409873962, + "eval_runtime": 865.9069, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1500 + }, + { + "epoch": 0.6337552742616034, + "grad_norm": 3.6419246196746826, + "learning_rate": 9.981014166194556e-05, + "loss": 0.8253764510154724, + "step": 1502 + }, + { + "epoch": 0.6345991561181434, + "grad_norm": 1.7333487272262573, + "learning_rate": 9.980804513902294e-05, + "loss": 0.8254884481430054, + "step": 1504 + }, + { + "epoch": 0.6354430379746835, + "grad_norm": 1.1998231410980225, + "learning_rate": 9.980593712634088e-05, + "loss": 0.7833738327026367, + "step": 1506 + }, + { + "epoch": 0.6362869198312237, + "grad_norm": 1.347011685371399, + "learning_rate": 9.980381762438566e-05, + "loss": 0.753408670425415, + "step": 1508 + }, + { + "epoch": 0.6371308016877637, + "grad_norm": 1.1759053468704224, + "learning_rate": 9.980168663364622e-05, + "loss": 0.7867791652679443, + "step": 1510 + }, + { + "epoch": 0.6379746835443038, + "grad_norm": 1.3113552331924438, + "learning_rate": 9.979954415461412e-05, + "loss": 0.6753612160682678, + "step": 1512 + }, + { + "epoch": 0.6388185654008439, + "grad_norm": 1.3258320093154907, + "learning_rate": 9.979739018778362e-05, + "loss": 0.750367283821106, + "step": 1514 + }, + { + "epoch": 0.639662447257384, + "grad_norm": 1.175145149230957, + "learning_rate": 9.979522473365157e-05, + "loss": 0.7505861520767212, + "step": 1516 + }, + { + "epoch": 0.640506329113924, + "grad_norm": 1.2276148796081543, + "learning_rate": 9.979304779271752e-05, + "loss": 0.7429317831993103, + "step": 1518 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 1.3262875080108643, + "learning_rate": 9.979085936548362e-05, + "loss": 0.786217212677002, + "step": 1520 + }, + { + "epoch": 0.6421940928270042, + "grad_norm": 1.3067121505737305, + "learning_rate": 9.978865945245473e-05, + "loss": 0.6942036151885986, + "step": 1522 + }, + { + "epoch": 0.6430379746835443, + "grad_norm": 1.5352400541305542, + "learning_rate": 9.978644805413832e-05, + "loss": 0.8281817436218262, + "step": 1524 + }, + { + "epoch": 0.6438818565400843, + "grad_norm": 1.2848507165908813, + "learning_rate": 9.97842251710445e-05, + "loss": 0.8110972046852112, + "step": 1526 + }, + { + "epoch": 0.6447257383966245, + "grad_norm": 1.352196216583252, + "learning_rate": 9.978199080368607e-05, + "loss": 0.7354730367660522, + "step": 1528 + }, + { + "epoch": 0.6455696202531646, + "grad_norm": 1.2427687644958496, + "learning_rate": 9.977974495257842e-05, + "loss": 0.7915583848953247, + "step": 1530 + }, + { + "epoch": 0.6464135021097046, + "grad_norm": 1.3163504600524902, + "learning_rate": 9.977748761823967e-05, + "loss": 0.7400109171867371, + "step": 1532 + }, + { + "epoch": 0.6472573839662448, + "grad_norm": 1.2496893405914307, + "learning_rate": 9.977521880119049e-05, + "loss": 0.7104899287223816, + "step": 1534 + }, + { + "epoch": 0.6481012658227848, + "grad_norm": 1.0907179117202759, + "learning_rate": 9.97729385019543e-05, + "loss": 0.8074463605880737, + "step": 1536 + }, + { + "epoch": 0.6489451476793249, + "grad_norm": 1.2323429584503174, + "learning_rate": 9.977064672105712e-05, + "loss": 0.7770540714263916, + "step": 1538 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 1.224428415298462, + "learning_rate": 9.976834345902759e-05, + "loss": 0.806465208530426, + "step": 1540 + }, + { + "epoch": 0.6506329113924051, + "grad_norm": 1.3529564142227173, + "learning_rate": 9.976602871639705e-05, + "loss": 0.7306749224662781, + "step": 1542 + }, + { + "epoch": 0.6514767932489451, + "grad_norm": 1.1770031452178955, + "learning_rate": 9.976370249369946e-05, + "loss": 0.783933699131012, + "step": 1544 + }, + { + "epoch": 0.6523206751054852, + "grad_norm": 1.205283522605896, + "learning_rate": 9.976136479147144e-05, + "loss": 0.6937689185142517, + "step": 1546 + }, + { + "epoch": 0.6531645569620254, + "grad_norm": 1.2329360246658325, + "learning_rate": 9.975901561025223e-05, + "loss": 0.8041763305664062, + "step": 1548 + }, + { + "epoch": 0.6540084388185654, + "grad_norm": 1.499973177909851, + "learning_rate": 9.975665495058377e-05, + "loss": 0.750390887260437, + "step": 1550 + }, + { + "epoch": 0.6548523206751055, + "grad_norm": 1.31832754611969, + "learning_rate": 9.975428281301061e-05, + "loss": 0.7658298015594482, + "step": 1552 + }, + { + "epoch": 0.6556962025316456, + "grad_norm": 1.3998414278030396, + "learning_rate": 9.975189919807994e-05, + "loss": 0.8651264905929565, + "step": 1554 + }, + { + "epoch": 0.6565400843881857, + "grad_norm": 1.2002551555633545, + "learning_rate": 9.974950410634164e-05, + "loss": 0.6776561141014099, + "step": 1556 + }, + { + "epoch": 0.6573839662447257, + "grad_norm": 1.1986602544784546, + "learning_rate": 9.97470975383482e-05, + "loss": 0.8159130811691284, + "step": 1558 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 1.3583602905273438, + "learning_rate": 9.974467949465477e-05, + "loss": 0.7528039216995239, + "step": 1560 + }, + { + "epoch": 0.6590717299578059, + "grad_norm": 1.4176239967346191, + "learning_rate": 9.974224997581913e-05, + "loss": 0.6970920562744141, + "step": 1562 + }, + { + "epoch": 0.659915611814346, + "grad_norm": 1.3899401426315308, + "learning_rate": 9.973980898240177e-05, + "loss": 0.7718377113342285, + "step": 1564 + }, + { + "epoch": 0.660759493670886, + "grad_norm": 1.222413182258606, + "learning_rate": 9.973735651496571e-05, + "loss": 0.7346280217170715, + "step": 1566 + }, + { + "epoch": 0.6616033755274262, + "grad_norm": 1.3750087022781372, + "learning_rate": 9.973489257407676e-05, + "loss": 0.7923588156700134, + "step": 1568 + }, + { + "epoch": 0.6624472573839663, + "grad_norm": 1.24547278881073, + "learning_rate": 9.973241716030325e-05, + "loss": 0.8258910179138184, + "step": 1570 + }, + { + "epoch": 0.6632911392405063, + "grad_norm": 1.2464141845703125, + "learning_rate": 9.972993027421624e-05, + "loss": 0.7869232296943665, + "step": 1572 + }, + { + "epoch": 0.6641350210970464, + "grad_norm": 1.3088903427124023, + "learning_rate": 9.972743191638939e-05, + "loss": 0.8144775629043579, + "step": 1574 + }, + { + "epoch": 0.6649789029535865, + "grad_norm": 1.2252418994903564, + "learning_rate": 9.972492208739903e-05, + "loss": 0.7432073950767517, + "step": 1576 + }, + { + "epoch": 0.6658227848101266, + "grad_norm": 1.2303717136383057, + "learning_rate": 9.972240078782413e-05, + "loss": 0.7386854887008667, + "step": 1578 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0226294994354248, + "learning_rate": 9.971986801824631e-05, + "loss": 0.7127882838249207, + "step": 1580 + }, + { + "epoch": 0.6675105485232068, + "grad_norm": 1.362332820892334, + "learning_rate": 9.971732377924982e-05, + "loss": 0.7557716369628906, + "step": 1582 + }, + { + "epoch": 0.6683544303797468, + "grad_norm": 1.4436695575714111, + "learning_rate": 9.971476807142158e-05, + "loss": 0.7832611203193665, + "step": 1584 + }, + { + "epoch": 0.6691983122362869, + "grad_norm": 1.276695966720581, + "learning_rate": 9.971220089535113e-05, + "loss": 0.8190197944641113, + "step": 1586 + }, + { + "epoch": 0.6700421940928271, + "grad_norm": 1.2413527965545654, + "learning_rate": 9.970962225163069e-05, + "loss": 0.747222363948822, + "step": 1588 + }, + { + "epoch": 0.6708860759493671, + "grad_norm": 1.3395767211914062, + "learning_rate": 9.970703214085507e-05, + "loss": 0.7846449017524719, + "step": 1590 + }, + { + "epoch": 0.6717299578059072, + "grad_norm": 1.291327953338623, + "learning_rate": 9.970443056362178e-05, + "loss": 0.8160232901573181, + "step": 1592 + }, + { + "epoch": 0.6725738396624472, + "grad_norm": 1.3139684200286865, + "learning_rate": 9.970181752053097e-05, + "loss": 0.7413806915283203, + "step": 1594 + }, + { + "epoch": 0.6734177215189874, + "grad_norm": 1.3170921802520752, + "learning_rate": 9.969919301218537e-05, + "loss": 0.7637304067611694, + "step": 1596 + }, + { + "epoch": 0.6742616033755274, + "grad_norm": 1.3349758386611938, + "learning_rate": 9.969655703919044e-05, + "loss": 0.7823366522789001, + "step": 1598 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 1.2151578664779663, + "learning_rate": 9.969390960215425e-05, + "loss": 0.6587790846824646, + "step": 1600 + }, + { + "epoch": 0.6751054852320675, + "eval_loss": 0.7836604714393616, + "eval_runtime": 861.5352, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 1600 + }, + { + "epoch": 0.6759493670886076, + "grad_norm": 1.2541478872299194, + "learning_rate": 9.96912507016875e-05, + "loss": 0.7314544320106506, + "step": 1602 + }, + { + "epoch": 0.6767932489451477, + "grad_norm": 1.091790795326233, + "learning_rate": 9.968858033840357e-05, + "loss": 0.702468752861023, + "step": 1604 + }, + { + "epoch": 0.6776371308016877, + "grad_norm": 1.36745285987854, + "learning_rate": 9.968589851291841e-05, + "loss": 0.7691897749900818, + "step": 1606 + }, + { + "epoch": 0.6784810126582278, + "grad_norm": 1.1325993537902832, + "learning_rate": 9.968320522585072e-05, + "loss": 0.7422228455543518, + "step": 1608 + }, + { + "epoch": 0.679324894514768, + "grad_norm": 1.1015450954437256, + "learning_rate": 9.968050047782176e-05, + "loss": 0.677532434463501, + "step": 1610 + }, + { + "epoch": 0.680168776371308, + "grad_norm": 1.2216695547103882, + "learning_rate": 9.967778426945548e-05, + "loss": 0.7973438501358032, + "step": 1612 + }, + { + "epoch": 0.6810126582278481, + "grad_norm": 1.159395456314087, + "learning_rate": 9.967505660137843e-05, + "loss": 0.6742876172065735, + "step": 1614 + }, + { + "epoch": 0.6818565400843882, + "grad_norm": 1.404433250427246, + "learning_rate": 9.967231747421988e-05, + "loss": 0.7592008709907532, + "step": 1616 + }, + { + "epoch": 0.6827004219409283, + "grad_norm": 1.2489168643951416, + "learning_rate": 9.966956688861164e-05, + "loss": 0.7565826177597046, + "step": 1618 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 1.2960615158081055, + "learning_rate": 9.966680484518825e-05, + "loss": 0.7694597840309143, + "step": 1620 + }, + { + "epoch": 0.6843881856540084, + "grad_norm": 1.3598436117172241, + "learning_rate": 9.966403134458685e-05, + "loss": 0.8392959833145142, + "step": 1622 + }, + { + "epoch": 0.6852320675105485, + "grad_norm": 1.258065938949585, + "learning_rate": 9.966124638744722e-05, + "loss": 0.8014217019081116, + "step": 1624 + }, + { + "epoch": 0.6860759493670886, + "grad_norm": 1.3132309913635254, + "learning_rate": 9.965844997441184e-05, + "loss": 0.7029755711555481, + "step": 1626 + }, + { + "epoch": 0.6869198312236287, + "grad_norm": 1.1204946041107178, + "learning_rate": 9.965564210612575e-05, + "loss": 0.7213528752326965, + "step": 1628 + }, + { + "epoch": 0.6877637130801688, + "grad_norm": 1.037251591682434, + "learning_rate": 9.965282278323667e-05, + "loss": 0.6895437240600586, + "step": 1630 + }, + { + "epoch": 0.6886075949367089, + "grad_norm": 1.093807578086853, + "learning_rate": 9.964999200639498e-05, + "loss": 0.8035063743591309, + "step": 1632 + }, + { + "epoch": 0.6894514767932489, + "grad_norm": 1.367386817932129, + "learning_rate": 9.964714977625367e-05, + "loss": 0.6191847920417786, + "step": 1634 + }, + { + "epoch": 0.6902953586497891, + "grad_norm": 1.3160961866378784, + "learning_rate": 9.964429609346841e-05, + "loss": 0.7469727993011475, + "step": 1636 + }, + { + "epoch": 0.6911392405063291, + "grad_norm": 1.3736863136291504, + "learning_rate": 9.964143095869748e-05, + "loss": 0.7987836599349976, + "step": 1638 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 1.323209524154663, + "learning_rate": 9.963855437260182e-05, + "loss": 0.7901709675788879, + "step": 1640 + }, + { + "epoch": 0.6928270042194092, + "grad_norm": 1.3943440914154053, + "learning_rate": 9.963566633584496e-05, + "loss": 0.7889530658721924, + "step": 1642 + }, + { + "epoch": 0.6936708860759494, + "grad_norm": 1.3699116706848145, + "learning_rate": 9.963276684909317e-05, + "loss": 0.756829559803009, + "step": 1644 + }, + { + "epoch": 0.6945147679324895, + "grad_norm": 1.4216378927230835, + "learning_rate": 9.962985591301529e-05, + "loss": 0.7840303182601929, + "step": 1646 + }, + { + "epoch": 0.6953586497890295, + "grad_norm": 1.2231985330581665, + "learning_rate": 9.962693352828279e-05, + "loss": 0.700393557548523, + "step": 1648 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.3568313121795654, + "learning_rate": 9.962399969556983e-05, + "loss": 0.7010306715965271, + "step": 1650 + }, + { + "epoch": 0.6970464135021097, + "grad_norm": 1.1662907600402832, + "learning_rate": 9.96210544155532e-05, + "loss": 0.6935506463050842, + "step": 1652 + }, + { + "epoch": 0.6978902953586498, + "grad_norm": 1.3066680431365967, + "learning_rate": 9.96180976889123e-05, + "loss": 0.7913851141929626, + "step": 1654 + }, + { + "epoch": 0.6987341772151898, + "grad_norm": 1.2268375158309937, + "learning_rate": 9.961512951632918e-05, + "loss": 0.764849066734314, + "step": 1656 + }, + { + "epoch": 0.69957805907173, + "grad_norm": 1.4509469270706177, + "learning_rate": 9.96121498984886e-05, + "loss": 0.7544103860855103, + "step": 1658 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 1.200772762298584, + "learning_rate": 9.960915883607782e-05, + "loss": 0.7766591310501099, + "step": 1660 + }, + { + "epoch": 0.7012658227848101, + "grad_norm": 1.3825311660766602, + "learning_rate": 9.960615632978687e-05, + "loss": 0.7433559894561768, + "step": 1662 + }, + { + "epoch": 0.7021097046413503, + "grad_norm": 1.3197243213653564, + "learning_rate": 9.960314238030836e-05, + "loss": 0.7770103812217712, + "step": 1664 + }, + { + "epoch": 0.7029535864978903, + "grad_norm": 1.515163779258728, + "learning_rate": 9.960011698833755e-05, + "loss": 0.8597216606140137, + "step": 1666 + }, + { + "epoch": 0.7037974683544304, + "grad_norm": 1.2329891920089722, + "learning_rate": 9.959708015457234e-05, + "loss": 0.7630532383918762, + "step": 1668 + }, + { + "epoch": 0.7046413502109705, + "grad_norm": 1.0592037439346313, + "learning_rate": 9.959403187971327e-05, + "loss": 0.7299806475639343, + "step": 1670 + }, + { + "epoch": 0.7054852320675106, + "grad_norm": 2.2717394828796387, + "learning_rate": 9.959097216446351e-05, + "loss": 0.6999854445457458, + "step": 1672 + }, + { + "epoch": 0.7063291139240506, + "grad_norm": 1.1552131175994873, + "learning_rate": 9.958790100952889e-05, + "loss": 0.8403060436248779, + "step": 1674 + }, + { + "epoch": 0.7071729957805907, + "grad_norm": 1.290488839149475, + "learning_rate": 9.958481841561787e-05, + "loss": 0.7729134559631348, + "step": 1676 + }, + { + "epoch": 0.7080168776371308, + "grad_norm": 1.1913278102874756, + "learning_rate": 9.958172438344152e-05, + "loss": 0.7100697755813599, + "step": 1678 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 1.2355852127075195, + "learning_rate": 9.957861891371359e-05, + "loss": 0.7014795541763306, + "step": 1680 + }, + { + "epoch": 0.7097046413502109, + "grad_norm": 1.258705496788025, + "learning_rate": 9.957550200715044e-05, + "loss": 0.8131424784660339, + "step": 1682 + }, + { + "epoch": 0.7105485232067511, + "grad_norm": 1.1102997064590454, + "learning_rate": 9.957237366447112e-05, + "loss": 0.6842480301856995, + "step": 1684 + }, + { + "epoch": 0.7113924050632912, + "grad_norm": 1.4466290473937988, + "learning_rate": 9.956923388639724e-05, + "loss": 0.6730120182037354, + "step": 1686 + }, + { + "epoch": 0.7122362869198312, + "grad_norm": 1.261152982711792, + "learning_rate": 9.956608267365311e-05, + "loss": 0.7109374403953552, + "step": 1688 + }, + { + "epoch": 0.7130801687763713, + "grad_norm": 1.4070630073547363, + "learning_rate": 9.956292002696562e-05, + "loss": 0.7545008063316345, + "step": 1690 + }, + { + "epoch": 0.7139240506329114, + "grad_norm": 1.2532793283462524, + "learning_rate": 9.955974594706436e-05, + "loss": 0.7892587184906006, + "step": 1692 + }, + { + "epoch": 0.7147679324894515, + "grad_norm": 1.1180293560028076, + "learning_rate": 9.955656043468153e-05, + "loss": 0.7348554134368896, + "step": 1694 + }, + { + "epoch": 0.7156118143459915, + "grad_norm": 1.333054542541504, + "learning_rate": 9.955336349055195e-05, + "loss": 0.8207674026489258, + "step": 1696 + }, + { + "epoch": 0.7164556962025317, + "grad_norm": 1.1373547315597534, + "learning_rate": 9.95501551154131e-05, + "loss": 0.7226691842079163, + "step": 1698 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 1.2342052459716797, + "learning_rate": 9.95469353100051e-05, + "loss": 0.726982831954956, + "step": 1700 + }, + { + "epoch": 0.7172995780590717, + "eval_loss": 0.7783148884773254, + "eval_runtime": 846.1986, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 1700 + }, + { + "epoch": 0.7181434599156118, + "grad_norm": 1.3781483173370361, + "learning_rate": 9.95437040750707e-05, + "loss": 0.7623077034950256, + "step": 1702 + }, + { + "epoch": 0.7189873417721518, + "grad_norm": 1.301440715789795, + "learning_rate": 9.954046141135526e-05, + "loss": 0.7421616315841675, + "step": 1704 + }, + { + "epoch": 0.719831223628692, + "grad_norm": 1.1375854015350342, + "learning_rate": 9.953720731960683e-05, + "loss": 0.685523509979248, + "step": 1706 + }, + { + "epoch": 0.7206751054852321, + "grad_norm": 1.2014397382736206, + "learning_rate": 9.953394180057604e-05, + "loss": 0.756073534488678, + "step": 1708 + }, + { + "epoch": 0.7215189873417721, + "grad_norm": 1.232802152633667, + "learning_rate": 9.95306648550162e-05, + "loss": 0.7364522814750671, + "step": 1710 + }, + { + "epoch": 0.7223628691983123, + "grad_norm": 1.4462472200393677, + "learning_rate": 9.952737648368323e-05, + "loss": 0.7073688507080078, + "step": 1712 + }, + { + "epoch": 0.7232067510548523, + "grad_norm": 1.123523473739624, + "learning_rate": 9.95240766873357e-05, + "loss": 0.7147064805030823, + "step": 1714 + }, + { + "epoch": 0.7240506329113924, + "grad_norm": 1.4111510515213013, + "learning_rate": 9.95207654667348e-05, + "loss": 0.7108398079872131, + "step": 1716 + }, + { + "epoch": 0.7248945147679325, + "grad_norm": 1.2785903215408325, + "learning_rate": 9.951744282264437e-05, + "loss": 0.7080079317092896, + "step": 1718 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 1.1361653804779053, + "learning_rate": 9.951410875583089e-05, + "loss": 0.7396624684333801, + "step": 1720 + }, + { + "epoch": 0.7265822784810126, + "grad_norm": 1.0762585401535034, + "learning_rate": 9.951076326706346e-05, + "loss": 0.7724334597587585, + "step": 1722 + }, + { + "epoch": 0.7274261603375527, + "grad_norm": 1.3104428052902222, + "learning_rate": 9.950740635711379e-05, + "loss": 0.7311923503875732, + "step": 1724 + }, + { + "epoch": 0.7282700421940929, + "grad_norm": 1.1291942596435547, + "learning_rate": 9.95040380267563e-05, + "loss": 0.6878296732902527, + "step": 1726 + }, + { + "epoch": 0.7291139240506329, + "grad_norm": 1.5171746015548706, + "learning_rate": 9.9500658276768e-05, + "loss": 0.7410538196563721, + "step": 1728 + }, + { + "epoch": 0.729957805907173, + "grad_norm": 1.0966423749923706, + "learning_rate": 9.949726710792848e-05, + "loss": 0.6953532695770264, + "step": 1730 + }, + { + "epoch": 0.7308016877637131, + "grad_norm": 1.2436997890472412, + "learning_rate": 9.949386452102007e-05, + "loss": 0.6679023504257202, + "step": 1732 + }, + { + "epoch": 0.7316455696202532, + "grad_norm": 1.1364835500717163, + "learning_rate": 9.949045051682766e-05, + "loss": 0.8046789765357971, + "step": 1734 + }, + { + "epoch": 0.7324894514767932, + "grad_norm": 1.296648383140564, + "learning_rate": 9.948702509613878e-05, + "loss": 0.7322937846183777, + "step": 1736 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.2355525493621826, + "learning_rate": 9.948358825974365e-05, + "loss": 0.7442626357078552, + "step": 1738 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 1.1634451150894165, + "learning_rate": 9.948014000843504e-05, + "loss": 0.7231078743934631, + "step": 1740 + }, + { + "epoch": 0.7350210970464135, + "grad_norm": 1.1500129699707031, + "learning_rate": 9.947668034300843e-05, + "loss": 0.6436833143234253, + "step": 1742 + }, + { + "epoch": 0.7358649789029535, + "grad_norm": 1.3881278038024902, + "learning_rate": 9.947320926426189e-05, + "loss": 0.8170580863952637, + "step": 1744 + }, + { + "epoch": 0.7367088607594937, + "grad_norm": 1.3479492664337158, + "learning_rate": 9.94697267729961e-05, + "loss": 0.7830947041511536, + "step": 1746 + }, + { + "epoch": 0.7375527426160338, + "grad_norm": 1.0187158584594727, + "learning_rate": 9.946623287001444e-05, + "loss": 0.7358533143997192, + "step": 1748 + }, + { + "epoch": 0.7383966244725738, + "grad_norm": 1.2575689554214478, + "learning_rate": 9.946272755612287e-05, + "loss": 0.7279790639877319, + "step": 1750 + }, + { + "epoch": 0.739240506329114, + "grad_norm": 1.2045027017593384, + "learning_rate": 9.945921083213002e-05, + "loss": 0.6953092217445374, + "step": 1752 + }, + { + "epoch": 0.740084388185654, + "grad_norm": 1.3994466066360474, + "learning_rate": 9.945568269884708e-05, + "loss": 0.8094141483306885, + "step": 1754 + }, + { + "epoch": 0.7409282700421941, + "grad_norm": 1.2892286777496338, + "learning_rate": 9.945214315708797e-05, + "loss": 0.6979201436042786, + "step": 1756 + }, + { + "epoch": 0.7417721518987341, + "grad_norm": 1.2006971836090088, + "learning_rate": 9.944859220766919e-05, + "loss": 0.6810774803161621, + "step": 1758 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 1.055793285369873, + "learning_rate": 9.944502985140986e-05, + "loss": 0.6796762347221375, + "step": 1760 + }, + { + "epoch": 0.7434599156118143, + "grad_norm": 1.174714207649231, + "learning_rate": 9.944145608913175e-05, + "loss": 0.7954121828079224, + "step": 1762 + }, + { + "epoch": 0.7443037974683544, + "grad_norm": 1.1638222932815552, + "learning_rate": 9.943787092165926e-05, + "loss": 0.6939491629600525, + "step": 1764 + }, + { + "epoch": 0.7451476793248946, + "grad_norm": 1.1861820220947266, + "learning_rate": 9.943427434981942e-05, + "loss": 0.8112956285476685, + "step": 1766 + }, + { + "epoch": 0.7459915611814346, + "grad_norm": 0.9667421579360962, + "learning_rate": 9.943066637444189e-05, + "loss": 0.6812481880187988, + "step": 1768 + }, + { + "epoch": 0.7468354430379747, + "grad_norm": 1.2826191186904907, + "learning_rate": 9.942704699635898e-05, + "loss": 0.7598370313644409, + "step": 1770 + }, + { + "epoch": 0.7476793248945147, + "grad_norm": 1.2257909774780273, + "learning_rate": 9.942341621640558e-05, + "loss": 0.7118877172470093, + "step": 1772 + }, + { + "epoch": 0.7485232067510549, + "grad_norm": 1.5224615335464478, + "learning_rate": 9.941977403541925e-05, + "loss": 0.8037024736404419, + "step": 1774 + }, + { + "epoch": 0.7493670886075949, + "grad_norm": 1.188689947128296, + "learning_rate": 9.941612045424018e-05, + "loss": 0.6795828938484192, + "step": 1776 + }, + { + "epoch": 0.750210970464135, + "grad_norm": 1.0685369968414307, + "learning_rate": 9.941245547371116e-05, + "loss": 0.6934568881988525, + "step": 1778 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 1.1643654108047485, + "learning_rate": 9.940877909467767e-05, + "loss": 0.6883851289749146, + "step": 1780 + }, + { + "epoch": 0.7518987341772152, + "grad_norm": 1.15621018409729, + "learning_rate": 9.940509131798775e-05, + "loss": 0.8284637928009033, + "step": 1782 + }, + { + "epoch": 0.7527426160337553, + "grad_norm": 1.1946302652359009, + "learning_rate": 9.94013921444921e-05, + "loss": 0.7108310461044312, + "step": 1784 + }, + { + "epoch": 0.7535864978902953, + "grad_norm": 1.1536555290222168, + "learning_rate": 9.939768157504404e-05, + "loss": 0.7166154384613037, + "step": 1786 + }, + { + "epoch": 0.7544303797468355, + "grad_norm": 1.3184611797332764, + "learning_rate": 9.939395961049956e-05, + "loss": 0.7774572372436523, + "step": 1788 + }, + { + "epoch": 0.7552742616033755, + "grad_norm": 1.0782374143600464, + "learning_rate": 9.939022625171723e-05, + "loss": 0.7386471033096313, + "step": 1790 + }, + { + "epoch": 0.7561181434599156, + "grad_norm": 1.1616696119308472, + "learning_rate": 9.938648149955824e-05, + "loss": 0.6495215892791748, + "step": 1792 + }, + { + "epoch": 0.7569620253164557, + "grad_norm": 1.1715892553329468, + "learning_rate": 9.938272535488647e-05, + "loss": 0.7733646631240845, + "step": 1794 + }, + { + "epoch": 0.7578059071729958, + "grad_norm": 1.203466773033142, + "learning_rate": 9.937895781856838e-05, + "loss": 0.7354782223701477, + "step": 1796 + }, + { + "epoch": 0.7586497890295358, + "grad_norm": 1.246559977531433, + "learning_rate": 9.937517889147305e-05, + "loss": 0.823226273059845, + "step": 1798 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.9968833923339844, + "learning_rate": 9.937138857447221e-05, + "loss": 0.6221681833267212, + "step": 1800 + }, + { + "epoch": 0.759493670886076, + "eval_loss": 0.7719914317131042, + "eval_runtime": 853.1943, + "eval_samples_per_second": 2.47, + "eval_steps_per_second": 2.47, + "step": 1800 + }, + { + "epoch": 0.760337552742616, + "grad_norm": 1.5454338788986206, + "learning_rate": 9.936758686844024e-05, + "loss": 0.7799059152603149, + "step": 1802 + }, + { + "epoch": 0.7611814345991561, + "grad_norm": 1.1954455375671387, + "learning_rate": 9.936377377425409e-05, + "loss": 0.653838038444519, + "step": 1804 + }, + { + "epoch": 0.7620253164556962, + "grad_norm": 1.2538350820541382, + "learning_rate": 9.935994929279339e-05, + "loss": 0.7046942710876465, + "step": 1806 + }, + { + "epoch": 0.7628691983122363, + "grad_norm": 1.2358729839324951, + "learning_rate": 9.935611342494035e-05, + "loss": 0.7821131348609924, + "step": 1808 + }, + { + "epoch": 0.7637130801687764, + "grad_norm": 1.2401310205459595, + "learning_rate": 9.935226617157986e-05, + "loss": 0.7594596147537231, + "step": 1810 + }, + { + "epoch": 0.7645569620253164, + "grad_norm": 1.3197205066680908, + "learning_rate": 9.934840753359938e-05, + "loss": 0.7512493133544922, + "step": 1812 + }, + { + "epoch": 0.7654008438818566, + "grad_norm": 1.2482305765151978, + "learning_rate": 9.934453751188903e-05, + "loss": 0.6953311562538147, + "step": 1814 + }, + { + "epoch": 0.7662447257383966, + "grad_norm": 1.5995157957077026, + "learning_rate": 9.934065610734157e-05, + "loss": 0.7699819803237915, + "step": 1816 + }, + { + "epoch": 0.7670886075949367, + "grad_norm": 1.2414922714233398, + "learning_rate": 9.933676332085235e-05, + "loss": 0.6532001495361328, + "step": 1818 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 1.2274713516235352, + "learning_rate": 9.933285915331937e-05, + "loss": 0.7716373801231384, + "step": 1820 + }, + { + "epoch": 0.7687763713080169, + "grad_norm": 1.2894618511199951, + "learning_rate": 9.932894360564322e-05, + "loss": 0.7002654671669006, + "step": 1822 + }, + { + "epoch": 0.769620253164557, + "grad_norm": 1.10796320438385, + "learning_rate": 9.932501667872718e-05, + "loss": 0.7970587015151978, + "step": 1824 + }, + { + "epoch": 0.770464135021097, + "grad_norm": 1.2393653392791748, + "learning_rate": 9.932107837347708e-05, + "loss": 0.8071644306182861, + "step": 1826 + }, + { + "epoch": 0.7713080168776372, + "grad_norm": 1.1999030113220215, + "learning_rate": 9.931712869080144e-05, + "loss": 0.7376157641410828, + "step": 1828 + }, + { + "epoch": 0.7721518987341772, + "grad_norm": 1.1166026592254639, + "learning_rate": 9.931316763161135e-05, + "loss": 0.7487053275108337, + "step": 1830 + }, + { + "epoch": 0.7729957805907173, + "grad_norm": 1.1788052320480347, + "learning_rate": 9.930919519682059e-05, + "loss": 0.733161985874176, + "step": 1832 + }, + { + "epoch": 0.7738396624472574, + "grad_norm": 1.309968113899231, + "learning_rate": 9.930521138734548e-05, + "loss": 0.7907692790031433, + "step": 1834 + }, + { + "epoch": 0.7746835443037975, + "grad_norm": 1.1685889959335327, + "learning_rate": 9.930121620410502e-05, + "loss": 0.7192210555076599, + "step": 1836 + }, + { + "epoch": 0.7755274261603375, + "grad_norm": 1.2243701219558716, + "learning_rate": 9.929720964802085e-05, + "loss": 0.7394438982009888, + "step": 1838 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 1.2940958738327026, + "learning_rate": 9.929319172001717e-05, + "loss": 0.7885041832923889, + "step": 1840 + }, + { + "epoch": 0.7772151898734178, + "grad_norm": 1.0952763557434082, + "learning_rate": 9.928916242102086e-05, + "loss": 0.6822885274887085, + "step": 1842 + }, + { + "epoch": 0.7780590717299578, + "grad_norm": 1.0333503484725952, + "learning_rate": 9.928512175196139e-05, + "loss": 0.7070927619934082, + "step": 1844 + }, + { + "epoch": 0.7789029535864979, + "grad_norm": 1.201359510421753, + "learning_rate": 9.928106971377088e-05, + "loss": 0.7041296362876892, + "step": 1846 + }, + { + "epoch": 0.779746835443038, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.927700630738404e-05, + "loss": 0.6630192995071411, + "step": 1848 + }, + { + "epoch": 0.7805907172995781, + "grad_norm": 1.2858322858810425, + "learning_rate": 9.927293153373823e-05, + "loss": 0.7628101110458374, + "step": 1850 + }, + { + "epoch": 0.7814345991561181, + "grad_norm": 1.3730580806732178, + "learning_rate": 9.926884539377343e-05, + "loss": 0.7557390928268433, + "step": 1852 + }, + { + "epoch": 0.7822784810126582, + "grad_norm": 1.4954931735992432, + "learning_rate": 9.92647478884322e-05, + "loss": 0.8217329978942871, + "step": 1854 + }, + { + "epoch": 0.7831223628691983, + "grad_norm": 1.1092652082443237, + "learning_rate": 9.92606390186598e-05, + "loss": 0.672879695892334, + "step": 1856 + }, + { + "epoch": 0.7839662447257384, + "grad_norm": 1.2077893018722534, + "learning_rate": 9.925651878540404e-05, + "loss": 0.7380653619766235, + "step": 1858 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 1.0789313316345215, + "learning_rate": 9.925238718961538e-05, + "loss": 0.6648160219192505, + "step": 1860 + }, + { + "epoch": 0.7856540084388186, + "grad_norm": 1.3950812816619873, + "learning_rate": 9.924824423224692e-05, + "loss": 0.8316769003868103, + "step": 1862 + }, + { + "epoch": 0.7864978902953587, + "grad_norm": 1.3934763669967651, + "learning_rate": 9.924408991425433e-05, + "loss": 0.7901778817176819, + "step": 1864 + }, + { + "epoch": 0.7873417721518987, + "grad_norm": 1.2191659212112427, + "learning_rate": 9.923992423659596e-05, + "loss": 0.7643826007843018, + "step": 1866 + }, + { + "epoch": 0.7881856540084389, + "grad_norm": 0.986673891544342, + "learning_rate": 9.923574720023274e-05, + "loss": 0.6314064860343933, + "step": 1868 + }, + { + "epoch": 0.7890295358649789, + "grad_norm": 1.003552794456482, + "learning_rate": 9.923155880612823e-05, + "loss": 0.8244763016700745, + "step": 1870 + }, + { + "epoch": 0.789873417721519, + "grad_norm": 1.0831382274627686, + "learning_rate": 9.92273590552486e-05, + "loss": 0.7398403882980347, + "step": 1872 + }, + { + "epoch": 0.790717299578059, + "grad_norm": 1.1782667636871338, + "learning_rate": 9.922314794856267e-05, + "loss": 0.735211968421936, + "step": 1874 + }, + { + "epoch": 0.7915611814345992, + "grad_norm": 2.230534076690674, + "learning_rate": 9.921892548704186e-05, + "loss": 0.7550510764122009, + "step": 1876 + }, + { + "epoch": 0.7924050632911392, + "grad_norm": 1.0191401243209839, + "learning_rate": 9.92146916716602e-05, + "loss": 0.7676286697387695, + "step": 1878 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 1.1347072124481201, + "learning_rate": 9.921044650339438e-05, + "loss": 0.7409467697143555, + "step": 1880 + }, + { + "epoch": 0.7940928270042195, + "grad_norm": 1.107528567314148, + "learning_rate": 9.920618998322364e-05, + "loss": 0.7760165333747864, + "step": 1882 + }, + { + "epoch": 0.7949367088607595, + "grad_norm": 1.1110666990280151, + "learning_rate": 9.92019221121299e-05, + "loss": 0.7360131740570068, + "step": 1884 + }, + { + "epoch": 0.7957805907172996, + "grad_norm": 1.267580509185791, + "learning_rate": 9.919764289109765e-05, + "loss": 0.7784845232963562, + "step": 1886 + }, + { + "epoch": 0.7966244725738396, + "grad_norm": 1.5894557237625122, + "learning_rate": 9.919335232111407e-05, + "loss": 0.7880831360816956, + "step": 1888 + }, + { + "epoch": 0.7974683544303798, + "grad_norm": 1.1906384229660034, + "learning_rate": 9.918905040316886e-05, + "loss": 0.7315587997436523, + "step": 1890 + }, + { + "epoch": 0.7983122362869198, + "grad_norm": 1.3626811504364014, + "learning_rate": 9.918473713825445e-05, + "loss": 0.7808622121810913, + "step": 1892 + }, + { + "epoch": 0.7991561181434599, + "grad_norm": 1.1801300048828125, + "learning_rate": 9.918041252736577e-05, + "loss": 0.7055642604827881, + "step": 1894 + }, + { + "epoch": 0.8, + "grad_norm": 1.2669063806533813, + "learning_rate": 9.917607657150046e-05, + "loss": 0.7188893556594849, + "step": 1896 + }, + { + "epoch": 0.8008438818565401, + "grad_norm": 1.1746855974197388, + "learning_rate": 9.91717292716587e-05, + "loss": 0.7787454128265381, + "step": 1898 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.120012640953064, + "learning_rate": 9.916737062884338e-05, + "loss": 0.720715343952179, + "step": 1900 + }, + { + "epoch": 0.8016877637130801, + "eval_loss": 0.7648926973342896, + "eval_runtime": 865.9394, + "eval_samples_per_second": 2.433, + "eval_steps_per_second": 2.433, + "step": 1900 + }, + { + "epoch": 0.8025316455696202, + "grad_norm": 1.1745549440383911, + "learning_rate": 9.916300064405993e-05, + "loss": 0.7544789910316467, + "step": 1902 + }, + { + "epoch": 0.8033755274261604, + "grad_norm": 1.1439874172210693, + "learning_rate": 9.915861931831643e-05, + "loss": 0.7479203343391418, + "step": 1904 + }, + { + "epoch": 0.8042194092827004, + "grad_norm": 1.3508219718933105, + "learning_rate": 9.915422665262356e-05, + "loss": 0.6995842456817627, + "step": 1906 + }, + { + "epoch": 0.8050632911392405, + "grad_norm": 1.1519006490707397, + "learning_rate": 9.914982264799462e-05, + "loss": 0.7152725458145142, + "step": 1908 + }, + { + "epoch": 0.8059071729957806, + "grad_norm": 1.0818005800247192, + "learning_rate": 9.914540730544554e-05, + "loss": 0.7105516195297241, + "step": 1910 + }, + { + "epoch": 0.8067510548523207, + "grad_norm": 1.1611127853393555, + "learning_rate": 9.914098062599485e-05, + "loss": 0.6911059617996216, + "step": 1912 + }, + { + "epoch": 0.8075949367088607, + "grad_norm": 1.1964445114135742, + "learning_rate": 9.91365426106637e-05, + "loss": 0.6897286772727966, + "step": 1914 + }, + { + "epoch": 0.8084388185654009, + "grad_norm": 1.3873497247695923, + "learning_rate": 9.913209326047585e-05, + "loss": 0.7263250350952148, + "step": 1916 + }, + { + "epoch": 0.809282700421941, + "grad_norm": 1.1729894876480103, + "learning_rate": 9.91276325764577e-05, + "loss": 0.7045295238494873, + "step": 1918 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.9089694619178772, + "learning_rate": 9.912316055963822e-05, + "loss": 0.587131142616272, + "step": 1920 + }, + { + "epoch": 0.810970464135021, + "grad_norm": 1.2051384449005127, + "learning_rate": 9.911867721104902e-05, + "loss": 0.7237880229949951, + "step": 1922 + }, + { + "epoch": 0.8118143459915612, + "grad_norm": 1.2152670621871948, + "learning_rate": 9.911418253172433e-05, + "loss": 0.6967294216156006, + "step": 1924 + }, + { + "epoch": 0.8126582278481013, + "grad_norm": 1.1193642616271973, + "learning_rate": 9.9109676522701e-05, + "loss": 0.7636315822601318, + "step": 1926 + }, + { + "epoch": 0.8135021097046413, + "grad_norm": 1.2457597255706787, + "learning_rate": 9.910515918501843e-05, + "loss": 0.7451969981193542, + "step": 1928 + }, + { + "epoch": 0.8143459915611815, + "grad_norm": 1.057009220123291, + "learning_rate": 9.910063051971876e-05, + "loss": 0.6320056319236755, + "step": 1930 + }, + { + "epoch": 0.8151898734177215, + "grad_norm": 1.2820258140563965, + "learning_rate": 9.909609052784661e-05, + "loss": 0.691004753112793, + "step": 1932 + }, + { + "epoch": 0.8160337552742616, + "grad_norm": 1.331312656402588, + "learning_rate": 9.909153921044927e-05, + "loss": 0.7741923332214355, + "step": 1934 + }, + { + "epoch": 0.8168776371308016, + "grad_norm": 1.2055360078811646, + "learning_rate": 9.908697656857668e-05, + "loss": 0.668049156665802, + "step": 1936 + }, + { + "epoch": 0.8177215189873418, + "grad_norm": 1.2124541997909546, + "learning_rate": 9.90824026032813e-05, + "loss": 0.6584748029708862, + "step": 1938 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 1.244288682937622, + "learning_rate": 9.90778173156183e-05, + "loss": 0.7081992626190186, + "step": 1940 + }, + { + "epoch": 0.8194092827004219, + "grad_norm": 1.250558853149414, + "learning_rate": 9.907322070664542e-05, + "loss": 0.7977840900421143, + "step": 1942 + }, + { + "epoch": 0.8202531645569621, + "grad_norm": 1.3892892599105835, + "learning_rate": 9.906861277742297e-05, + "loss": 0.7830103635787964, + "step": 1944 + }, + { + "epoch": 0.8210970464135021, + "grad_norm": 1.3152644634246826, + "learning_rate": 9.906399352901393e-05, + "loss": 0.8451479077339172, + "step": 1946 + }, + { + "epoch": 0.8219409282700422, + "grad_norm": 1.1102250814437866, + "learning_rate": 9.905936296248388e-05, + "loss": 0.7035528421401978, + "step": 1948 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.0271214246749878, + "learning_rate": 9.905472107890101e-05, + "loss": 0.764616847038269, + "step": 1950 + }, + { + "epoch": 0.8236286919831224, + "grad_norm": 1.1772255897521973, + "learning_rate": 9.905006787933609e-05, + "loss": 0.7699717283248901, + "step": 1952 + }, + { + "epoch": 0.8244725738396624, + "grad_norm": 1.2486404180526733, + "learning_rate": 9.904540336486252e-05, + "loss": 0.7755605578422546, + "step": 1954 + }, + { + "epoch": 0.8253164556962025, + "grad_norm": 1.070148229598999, + "learning_rate": 9.904072753655635e-05, + "loss": 0.688934326171875, + "step": 1956 + }, + { + "epoch": 0.8261603375527427, + "grad_norm": 1.118401288986206, + "learning_rate": 9.903604039549617e-05, + "loss": 0.7447791695594788, + "step": 1958 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 1.2209899425506592, + "learning_rate": 9.903134194276323e-05, + "loss": 0.7990683317184448, + "step": 1960 + }, + { + "epoch": 0.8278481012658228, + "grad_norm": 1.296093225479126, + "learning_rate": 9.902663217944137e-05, + "loss": 0.7290873527526855, + "step": 1962 + }, + { + "epoch": 0.8286919831223629, + "grad_norm": 1.2594937086105347, + "learning_rate": 9.902191110661704e-05, + "loss": 0.7971217036247253, + "step": 1964 + }, + { + "epoch": 0.829535864978903, + "grad_norm": 1.6016536951065063, + "learning_rate": 9.90171787253793e-05, + "loss": 0.6728768348693848, + "step": 1966 + }, + { + "epoch": 0.830379746835443, + "grad_norm": 3.3128950595855713, + "learning_rate": 9.901243503681983e-05, + "loss": 0.7684211730957031, + "step": 1968 + }, + { + "epoch": 0.8312236286919831, + "grad_norm": 1.2970373630523682, + "learning_rate": 9.90076800420329e-05, + "loss": 0.756637454032898, + "step": 1970 + }, + { + "epoch": 0.8320675105485232, + "grad_norm": 1.1388959884643555, + "learning_rate": 9.900291374211538e-05, + "loss": 0.6692084074020386, + "step": 1972 + }, + { + "epoch": 0.8329113924050633, + "grad_norm": 1.050641655921936, + "learning_rate": 9.899813613816677e-05, + "loss": 0.7298309803009033, + "step": 1974 + }, + { + "epoch": 0.8337552742616033, + "grad_norm": 1.2598577737808228, + "learning_rate": 9.899334723128922e-05, + "loss": 0.6886547803878784, + "step": 1976 + }, + { + "epoch": 0.8345991561181435, + "grad_norm": 1.2800767421722412, + "learning_rate": 9.898854702258735e-05, + "loss": 0.745341420173645, + "step": 1978 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.1923155784606934, + "learning_rate": 9.898373551316856e-05, + "loss": 0.7133575081825256, + "step": 1980 + }, + { + "epoch": 0.8362869198312236, + "grad_norm": 1.156121015548706, + "learning_rate": 9.897891270414272e-05, + "loss": 0.8117790818214417, + "step": 1982 + }, + { + "epoch": 0.8371308016877637, + "grad_norm": 1.0400618314743042, + "learning_rate": 9.897407859662238e-05, + "loss": 0.6094260215759277, + "step": 1984 + }, + { + "epoch": 0.8379746835443038, + "grad_norm": 1.451953411102295, + "learning_rate": 9.896923319172268e-05, + "loss": 0.7680332064628601, + "step": 1986 + }, + { + "epoch": 0.8388185654008439, + "grad_norm": 1.2560248374938965, + "learning_rate": 9.896437649056134e-05, + "loss": 0.6918784379959106, + "step": 1988 + }, + { + "epoch": 0.8396624472573839, + "grad_norm": 1.2744325399398804, + "learning_rate": 9.895950849425874e-05, + "loss": 0.7654696106910706, + "step": 1990 + }, + { + "epoch": 0.8405063291139241, + "grad_norm": 1.304439902305603, + "learning_rate": 9.895462920393781e-05, + "loss": 0.7585932612419128, + "step": 1992 + }, + { + "epoch": 0.8413502109704641, + "grad_norm": 1.578957200050354, + "learning_rate": 9.89497386207241e-05, + "loss": 0.7474164962768555, + "step": 1994 + }, + { + "epoch": 0.8421940928270042, + "grad_norm": 1.0358996391296387, + "learning_rate": 9.89448367457458e-05, + "loss": 0.663844883441925, + "step": 1996 + }, + { + "epoch": 0.8430379746835444, + "grad_norm": 1.2285103797912598, + "learning_rate": 9.893992358013366e-05, + "loss": 0.7578557729721069, + "step": 1998 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 1.2051875591278076, + "learning_rate": 9.893499912502108e-05, + "loss": 0.7795036435127258, + "step": 2000 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.7587011456489563, + "eval_runtime": 856.2276, + "eval_samples_per_second": 2.461, + "eval_steps_per_second": 2.461, + "step": 2000 + }, + { + "epoch": 0.8447257383966245, + "grad_norm": 1.145434021949768, + "learning_rate": 9.893006338154401e-05, + "loss": 0.731850802898407, + "step": 2002 + }, + { + "epoch": 0.8455696202531645, + "grad_norm": 1.0618077516555786, + "learning_rate": 9.892511635084101e-05, + "loss": 0.6711665391921997, + "step": 2004 + }, + { + "epoch": 0.8464135021097047, + "grad_norm": 1.1657867431640625, + "learning_rate": 9.892015803405331e-05, + "loss": 0.6894803643226624, + "step": 2006 + }, + { + "epoch": 0.8472573839662447, + "grad_norm": 1.080140233039856, + "learning_rate": 9.891518843232467e-05, + "loss": 0.628146231174469, + "step": 2008 + }, + { + "epoch": 0.8481012658227848, + "grad_norm": 1.0664509534835815, + "learning_rate": 9.891020754680151e-05, + "loss": 0.740858793258667, + "step": 2010 + }, + { + "epoch": 0.8489451476793249, + "grad_norm": 1.5567615032196045, + "learning_rate": 9.89052153786328e-05, + "loss": 0.7763919234275818, + "step": 2012 + }, + { + "epoch": 0.849789029535865, + "grad_norm": 1.4347095489501953, + "learning_rate": 9.890021192897016e-05, + "loss": 0.8131396770477295, + "step": 2014 + }, + { + "epoch": 0.850632911392405, + "grad_norm": 1.1787892580032349, + "learning_rate": 9.889519719896776e-05, + "loss": 0.6829051375389099, + "step": 2016 + }, + { + "epoch": 0.8514767932489451, + "grad_norm": 1.239745855331421, + "learning_rate": 9.889017118978241e-05, + "loss": 0.7664558291435242, + "step": 2018 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 1.1224207878112793, + "learning_rate": 9.888513390257352e-05, + "loss": 0.7307376861572266, + "step": 2020 + }, + { + "epoch": 0.8531645569620253, + "grad_norm": 1.100536823272705, + "learning_rate": 9.88800853385031e-05, + "loss": 0.6786578893661499, + "step": 2022 + }, + { + "epoch": 0.8540084388185654, + "grad_norm": 1.25773024559021, + "learning_rate": 9.887502549873576e-05, + "loss": 0.7971984148025513, + "step": 2024 + }, + { + "epoch": 0.8548523206751055, + "grad_norm": 0.9980104565620422, + "learning_rate": 9.886995438443868e-05, + "loss": 0.6990941166877747, + "step": 2026 + }, + { + "epoch": 0.8556962025316456, + "grad_norm": 1.0464621782302856, + "learning_rate": 9.886487199678171e-05, + "loss": 0.763938307762146, + "step": 2028 + }, + { + "epoch": 0.8565400843881856, + "grad_norm": 1.2303017377853394, + "learning_rate": 9.885977833693724e-05, + "loss": 0.7165632247924805, + "step": 2030 + }, + { + "epoch": 0.8573839662447258, + "grad_norm": 1.2203325033187866, + "learning_rate": 9.885467340608027e-05, + "loss": 0.7586364150047302, + "step": 2032 + }, + { + "epoch": 0.8582278481012658, + "grad_norm": 1.113882064819336, + "learning_rate": 9.884955720538843e-05, + "loss": 0.703253984451294, + "step": 2034 + }, + { + "epoch": 0.8590717299578059, + "grad_norm": 1.1731632947921753, + "learning_rate": 9.88444297360419e-05, + "loss": 0.8530917763710022, + "step": 2036 + }, + { + "epoch": 0.859915611814346, + "grad_norm": 1.4592338800430298, + "learning_rate": 9.883929099922349e-05, + "loss": 0.8166638612747192, + "step": 2038 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.1279125213623047, + "learning_rate": 9.883414099611864e-05, + "loss": 0.6762415170669556, + "step": 2040 + }, + { + "epoch": 0.8616033755274262, + "grad_norm": 1.1587293148040771, + "learning_rate": 9.882897972791534e-05, + "loss": 0.6826539039611816, + "step": 2042 + }, + { + "epoch": 0.8624472573839662, + "grad_norm": 1.1909502744674683, + "learning_rate": 9.88238071958042e-05, + "loss": 0.7372410893440247, + "step": 2044 + }, + { + "epoch": 0.8632911392405064, + "grad_norm": 1.0340155363082886, + "learning_rate": 9.881862340097841e-05, + "loss": 0.699260950088501, + "step": 2046 + }, + { + "epoch": 0.8641350210970464, + "grad_norm": 1.1745870113372803, + "learning_rate": 9.881342834463379e-05, + "loss": 0.7689789533615112, + "step": 2048 + }, + { + "epoch": 0.8649789029535865, + "grad_norm": 1.0003606081008911, + "learning_rate": 9.880822202796872e-05, + "loss": 0.6877372860908508, + "step": 2050 + }, + { + "epoch": 0.8658227848101265, + "grad_norm": 1.2546781301498413, + "learning_rate": 9.88030044521842e-05, + "loss": 0.7632413506507874, + "step": 2052 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.1178704500198364, + "learning_rate": 9.879777561848385e-05, + "loss": 0.6776729822158813, + "step": 2054 + }, + { + "epoch": 0.8675105485232067, + "grad_norm": 1.523606777191162, + "learning_rate": 9.879253552807384e-05, + "loss": 0.7592973709106445, + "step": 2056 + }, + { + "epoch": 0.8683544303797468, + "grad_norm": 1.3490995168685913, + "learning_rate": 9.878728418216296e-05, + "loss": 0.8028839230537415, + "step": 2058 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 1.1851624250411987, + "learning_rate": 9.87820215819626e-05, + "loss": 0.7499933838844299, + "step": 2060 + }, + { + "epoch": 0.870042194092827, + "grad_norm": 1.1877925395965576, + "learning_rate": 9.877674772868672e-05, + "loss": 0.7324717044830322, + "step": 2062 + }, + { + "epoch": 0.8708860759493671, + "grad_norm": 1.2982885837554932, + "learning_rate": 9.877146262355194e-05, + "loss": 0.7456585168838501, + "step": 2064 + }, + { + "epoch": 0.8717299578059071, + "grad_norm": 1.043912649154663, + "learning_rate": 9.876616626777739e-05, + "loss": 0.7552799582481384, + "step": 2066 + }, + { + "epoch": 0.8725738396624473, + "grad_norm": 1.172580599784851, + "learning_rate": 9.876085866258487e-05, + "loss": 0.6964990496635437, + "step": 2068 + }, + { + "epoch": 0.8734177215189873, + "grad_norm": 1.26815927028656, + "learning_rate": 9.875553980919871e-05, + "loss": 0.7368612289428711, + "step": 2070 + }, + { + "epoch": 0.8742616033755274, + "grad_norm": 1.1268136501312256, + "learning_rate": 9.875020970884587e-05, + "loss": 0.7400802969932556, + "step": 2072 + }, + { + "epoch": 0.8751054852320675, + "grad_norm": 1.0556721687316895, + "learning_rate": 9.874486836275594e-05, + "loss": 0.6931334137916565, + "step": 2074 + }, + { + "epoch": 0.8759493670886076, + "grad_norm": 1.1967823505401611, + "learning_rate": 9.873951577216106e-05, + "loss": 0.7124089002609253, + "step": 2076 + }, + { + "epoch": 0.8767932489451477, + "grad_norm": 1.1753164529800415, + "learning_rate": 9.873415193829591e-05, + "loss": 0.7462030053138733, + "step": 2078 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 1.326923131942749, + "learning_rate": 9.872877686239789e-05, + "loss": 0.778078019618988, + "step": 2080 + }, + { + "epoch": 0.8784810126582279, + "grad_norm": 1.1472662687301636, + "learning_rate": 9.87233905457069e-05, + "loss": 0.6592919826507568, + "step": 2082 + }, + { + "epoch": 0.8793248945147679, + "grad_norm": 1.1162762641906738, + "learning_rate": 9.871799298946544e-05, + "loss": 0.661717414855957, + "step": 2084 + }, + { + "epoch": 0.880168776371308, + "grad_norm": 1.1694408655166626, + "learning_rate": 9.871258419491866e-05, + "loss": 0.6203670501708984, + "step": 2086 + }, + { + "epoch": 0.8810126582278481, + "grad_norm": 1.229691505432129, + "learning_rate": 9.870716416331425e-05, + "loss": 0.758888304233551, + "step": 2088 + }, + { + "epoch": 0.8818565400843882, + "grad_norm": 1.540377140045166, + "learning_rate": 9.870173289590251e-05, + "loss": 0.760649561882019, + "step": 2090 + }, + { + "epoch": 0.8827004219409282, + "grad_norm": 1.173628568649292, + "learning_rate": 9.869629039393632e-05, + "loss": 0.6981227397918701, + "step": 2092 + }, + { + "epoch": 0.8835443037974684, + "grad_norm": 1.1404013633728027, + "learning_rate": 9.869083665867116e-05, + "loss": 0.7808336615562439, + "step": 2094 + }, + { + "epoch": 0.8843881856540085, + "grad_norm": 1.1038721799850464, + "learning_rate": 9.868537169136511e-05, + "loss": 0.7540555596351624, + "step": 2096 + }, + { + "epoch": 0.8852320675105485, + "grad_norm": 1.1510080099105835, + "learning_rate": 9.867989549327885e-05, + "loss": 0.6650454998016357, + "step": 2098 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 1.166912317276001, + "learning_rate": 9.867440806567561e-05, + "loss": 0.673769474029541, + "step": 2100 + }, + { + "epoch": 0.8860759493670886, + "eval_loss": 0.7559094429016113, + "eval_runtime": 847.8311, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2100 + }, + { + "epoch": 0.8869198312236287, + "grad_norm": 1.227583885192871, + "learning_rate": 9.866890940982121e-05, + "loss": 0.8314241766929626, + "step": 2102 + }, + { + "epoch": 0.8877637130801688, + "grad_norm": 1.1813976764678955, + "learning_rate": 9.866339952698413e-05, + "loss": 0.6770843863487244, + "step": 2104 + }, + { + "epoch": 0.8886075949367088, + "grad_norm": 1.2471063137054443, + "learning_rate": 9.865787841843539e-05, + "loss": 0.7142292857170105, + "step": 2106 + }, + { + "epoch": 0.889451476793249, + "grad_norm": 1.1602860689163208, + "learning_rate": 9.865234608544858e-05, + "loss": 0.6981731653213501, + "step": 2108 + }, + { + "epoch": 0.890295358649789, + "grad_norm": 1.145677089691162, + "learning_rate": 9.864680252929992e-05, + "loss": 0.7019379138946533, + "step": 2110 + }, + { + "epoch": 0.8911392405063291, + "grad_norm": 1.2222462892532349, + "learning_rate": 9.86412477512682e-05, + "loss": 0.7690986394882202, + "step": 2112 + }, + { + "epoch": 0.8919831223628693, + "grad_norm": 1.1288166046142578, + "learning_rate": 9.863568175263478e-05, + "loss": 0.7241792678833008, + "step": 2114 + }, + { + "epoch": 0.8928270042194093, + "grad_norm": 1.1773978471755981, + "learning_rate": 9.863010453468364e-05, + "loss": 0.7392162084579468, + "step": 2116 + }, + { + "epoch": 0.8936708860759494, + "grad_norm": 1.102638840675354, + "learning_rate": 9.862451609870136e-05, + "loss": 0.7603078484535217, + "step": 2118 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.1325360536575317, + "learning_rate": 9.861891644597707e-05, + "loss": 0.6804911494255066, + "step": 2120 + }, + { + "epoch": 0.8953586497890296, + "grad_norm": 1.1381969451904297, + "learning_rate": 9.86133055778025e-05, + "loss": 0.787288248538971, + "step": 2122 + }, + { + "epoch": 0.8962025316455696, + "grad_norm": 1.2454546689987183, + "learning_rate": 9.860768349547196e-05, + "loss": 0.7282505035400391, + "step": 2124 + }, + { + "epoch": 0.8970464135021097, + "grad_norm": 1.2568305730819702, + "learning_rate": 9.860205020028237e-05, + "loss": 0.7554803490638733, + "step": 2126 + }, + { + "epoch": 0.8978902953586498, + "grad_norm": 1.1523523330688477, + "learning_rate": 9.859640569353321e-05, + "loss": 0.7126525044441223, + "step": 2128 + }, + { + "epoch": 0.8987341772151899, + "grad_norm": 1.314878225326538, + "learning_rate": 9.859074997652658e-05, + "loss": 0.7300811409950256, + "step": 2130 + }, + { + "epoch": 0.8995780590717299, + "grad_norm": 1.1272218227386475, + "learning_rate": 9.858508305056713e-05, + "loss": 0.7217329144477844, + "step": 2132 + }, + { + "epoch": 0.90042194092827, + "grad_norm": 1.10934317111969, + "learning_rate": 9.857940491696211e-05, + "loss": 0.714308500289917, + "step": 2134 + }, + { + "epoch": 0.9012658227848102, + "grad_norm": 1.1991039514541626, + "learning_rate": 9.857371557702136e-05, + "loss": 0.6613366007804871, + "step": 2136 + }, + { + "epoch": 0.9021097046413502, + "grad_norm": 1.3176918029785156, + "learning_rate": 9.85680150320573e-05, + "loss": 0.6972863078117371, + "step": 2138 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 1.1966592073440552, + "learning_rate": 9.856230328338496e-05, + "loss": 0.7299100160598755, + "step": 2140 + }, + { + "epoch": 0.9037974683544304, + "grad_norm": 1.2889270782470703, + "learning_rate": 9.85565803323219e-05, + "loss": 0.7145020961761475, + "step": 2142 + }, + { + "epoch": 0.9046413502109705, + "grad_norm": 1.2112789154052734, + "learning_rate": 9.855084618018828e-05, + "loss": 0.6717942953109741, + "step": 2144 + }, + { + "epoch": 0.9054852320675105, + "grad_norm": 1.2550239562988281, + "learning_rate": 9.85451008283069e-05, + "loss": 0.7460196018218994, + "step": 2146 + }, + { + "epoch": 0.9063291139240506, + "grad_norm": 1.2926387786865234, + "learning_rate": 9.853934427800309e-05, + "loss": 0.8300626873970032, + "step": 2148 + }, + { + "epoch": 0.9071729957805907, + "grad_norm": 1.0690672397613525, + "learning_rate": 9.853357653060478e-05, + "loss": 0.715215802192688, + "step": 2150 + }, + { + "epoch": 0.9080168776371308, + "grad_norm": 1.1021424531936646, + "learning_rate": 9.852779758744245e-05, + "loss": 0.7021427154541016, + "step": 2152 + }, + { + "epoch": 0.9088607594936708, + "grad_norm": 1.0713517665863037, + "learning_rate": 9.852200744984921e-05, + "loss": 0.7576406598091125, + "step": 2154 + }, + { + "epoch": 0.909704641350211, + "grad_norm": 1.277526617050171, + "learning_rate": 9.851620611916075e-05, + "loss": 0.7008846998214722, + "step": 2156 + }, + { + "epoch": 0.9105485232067511, + "grad_norm": 1.2434618473052979, + "learning_rate": 9.85103935967153e-05, + "loss": 0.7536613345146179, + "step": 2158 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 1.1654841899871826, + "learning_rate": 9.850456988385371e-05, + "loss": 0.7435567378997803, + "step": 2160 + }, + { + "epoch": 0.9122362869198313, + "grad_norm": 1.0718246698379517, + "learning_rate": 9.849873498191939e-05, + "loss": 0.7725666165351868, + "step": 2162 + }, + { + "epoch": 0.9130801687763713, + "grad_norm": 1.3425630331039429, + "learning_rate": 9.849288889225835e-05, + "loss": 0.7833593487739563, + "step": 2164 + }, + { + "epoch": 0.9139240506329114, + "grad_norm": 1.1989985704421997, + "learning_rate": 9.848703161621917e-05, + "loss": 0.7290158867835999, + "step": 2166 + }, + { + "epoch": 0.9147679324894514, + "grad_norm": 1.0549380779266357, + "learning_rate": 9.8481163155153e-05, + "loss": 0.6787996888160706, + "step": 2168 + }, + { + "epoch": 0.9156118143459916, + "grad_norm": 1.0757017135620117, + "learning_rate": 9.847528351041359e-05, + "loss": 0.7645748853683472, + "step": 2170 + }, + { + "epoch": 0.9164556962025316, + "grad_norm": 1.0636975765228271, + "learning_rate": 9.846939268335726e-05, + "loss": 0.6640698313713074, + "step": 2172 + }, + { + "epoch": 0.9172995780590717, + "grad_norm": 1.2038439512252808, + "learning_rate": 9.846349067534291e-05, + "loss": 0.7216284275054932, + "step": 2174 + }, + { + "epoch": 0.9181434599156119, + "grad_norm": 1.17854642868042, + "learning_rate": 9.845757748773203e-05, + "loss": 0.7244991660118103, + "step": 2176 + }, + { + "epoch": 0.9189873417721519, + "grad_norm": 1.0391159057617188, + "learning_rate": 9.845165312188864e-05, + "loss": 0.6043152809143066, + "step": 2178 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 1.2382071018218994, + "learning_rate": 9.844571757917944e-05, + "loss": 0.7791659832000732, + "step": 2180 + }, + { + "epoch": 0.920675105485232, + "grad_norm": 1.0855708122253418, + "learning_rate": 9.84397708609736e-05, + "loss": 0.7190433144569397, + "step": 2182 + }, + { + "epoch": 0.9215189873417722, + "grad_norm": 1.103308916091919, + "learning_rate": 9.843381296864291e-05, + "loss": 0.6648658514022827, + "step": 2184 + }, + { + "epoch": 0.9223628691983122, + "grad_norm": 1.073517918586731, + "learning_rate": 9.842784390356178e-05, + "loss": 0.6891760230064392, + "step": 2186 + }, + { + "epoch": 0.9232067510548523, + "grad_norm": 1.0806199312210083, + "learning_rate": 9.842186366710712e-05, + "loss": 0.6880859136581421, + "step": 2188 + }, + { + "epoch": 0.9240506329113924, + "grad_norm": 1.0631483793258667, + "learning_rate": 9.841587226065848e-05, + "loss": 0.6238307952880859, + "step": 2190 + }, + { + "epoch": 0.9248945147679325, + "grad_norm": 1.2630863189697266, + "learning_rate": 9.840986968559795e-05, + "loss": 0.6905744075775146, + "step": 2192 + }, + { + "epoch": 0.9257383966244725, + "grad_norm": 1.1307560205459595, + "learning_rate": 9.840385594331022e-05, + "loss": 0.7531564235687256, + "step": 2194 + }, + { + "epoch": 0.9265822784810127, + "grad_norm": 1.0294862985610962, + "learning_rate": 9.839783103518254e-05, + "loss": 0.6750671863555908, + "step": 2196 + }, + { + "epoch": 0.9274261603375528, + "grad_norm": 1.2446976900100708, + "learning_rate": 9.839179496260472e-05, + "loss": 0.7200804352760315, + "step": 2198 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 1.2673420906066895, + "learning_rate": 9.83857477269692e-05, + "loss": 0.7002623677253723, + "step": 2200 + }, + { + "epoch": 0.9282700421940928, + "eval_loss": 0.7497645616531372, + "eval_runtime": 856.8766, + "eval_samples_per_second": 2.459, + "eval_steps_per_second": 2.459, + "step": 2200 + }, + { + "epoch": 0.9291139240506329, + "grad_norm": 1.5114624500274658, + "learning_rate": 9.837968932967094e-05, + "loss": 0.7718265056610107, + "step": 2202 + }, + { + "epoch": 0.929957805907173, + "grad_norm": 1.2059369087219238, + "learning_rate": 9.837361977210751e-05, + "loss": 0.7204271554946899, + "step": 2204 + }, + { + "epoch": 0.9308016877637131, + "grad_norm": 1.2077301740646362, + "learning_rate": 9.836753905567902e-05, + "loss": 0.7371073961257935, + "step": 2206 + }, + { + "epoch": 0.9316455696202531, + "grad_norm": 1.120097279548645, + "learning_rate": 9.836144718178818e-05, + "loss": 0.6601167321205139, + "step": 2208 + }, + { + "epoch": 0.9324894514767933, + "grad_norm": 1.1755714416503906, + "learning_rate": 9.835534415184029e-05, + "loss": 0.6897423267364502, + "step": 2210 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3587000370025635, + "learning_rate": 9.834922996724317e-05, + "loss": 0.758438229560852, + "step": 2212 + }, + { + "epoch": 0.9341772151898734, + "grad_norm": 1.1898177862167358, + "learning_rate": 9.834310462940727e-05, + "loss": 0.7489214539527893, + "step": 2214 + }, + { + "epoch": 0.9350210970464135, + "grad_norm": 1.0814623832702637, + "learning_rate": 9.833696813974558e-05, + "loss": 0.6844488382339478, + "step": 2216 + }, + { + "epoch": 0.9358649789029536, + "grad_norm": 1.1060179471969604, + "learning_rate": 9.833082049967366e-05, + "loss": 0.6617586016654968, + "step": 2218 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 1.1780575513839722, + "learning_rate": 9.832466171060968e-05, + "loss": 0.7383584976196289, + "step": 2220 + }, + { + "epoch": 0.9375527426160337, + "grad_norm": 1.3734618425369263, + "learning_rate": 9.831849177397432e-05, + "loss": 0.7764308452606201, + "step": 2222 + }, + { + "epoch": 0.9383966244725739, + "grad_norm": 1.1367733478546143, + "learning_rate": 9.831231069119089e-05, + "loss": 0.6834397912025452, + "step": 2224 + }, + { + "epoch": 0.9392405063291139, + "grad_norm": 1.1695492267608643, + "learning_rate": 9.830611846368524e-05, + "loss": 0.7054480910301208, + "step": 2226 + }, + { + "epoch": 0.940084388185654, + "grad_norm": 1.0345736742019653, + "learning_rate": 9.829991509288579e-05, + "loss": 0.694448709487915, + "step": 2228 + }, + { + "epoch": 0.9409282700421941, + "grad_norm": 1.298105239868164, + "learning_rate": 9.829370058022356e-05, + "loss": 0.6839741468429565, + "step": 2230 + }, + { + "epoch": 0.9417721518987342, + "grad_norm": 1.2905502319335938, + "learning_rate": 9.828747492713209e-05, + "loss": 0.7886884212493896, + "step": 2232 + }, + { + "epoch": 0.9426160337552743, + "grad_norm": 1.12301504611969, + "learning_rate": 9.828123813504753e-05, + "loss": 0.7206413149833679, + "step": 2234 + }, + { + "epoch": 0.9434599156118143, + "grad_norm": 1.2644896507263184, + "learning_rate": 9.82749902054086e-05, + "loss": 0.7700693607330322, + "step": 2236 + }, + { + "epoch": 0.9443037974683545, + "grad_norm": 1.1626365184783936, + "learning_rate": 9.826873113965655e-05, + "loss": 0.7199711203575134, + "step": 2238 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 1.0728627443313599, + "learning_rate": 9.826246093923528e-05, + "loss": 0.7183539271354675, + "step": 2240 + }, + { + "epoch": 0.9459915611814346, + "grad_norm": 1.1444766521453857, + "learning_rate": 9.825617960559114e-05, + "loss": 0.7417964935302734, + "step": 2242 + }, + { + "epoch": 0.9468354430379747, + "grad_norm": 1.4059823751449585, + "learning_rate": 9.824988714017316e-05, + "loss": 0.7949740290641785, + "step": 2244 + }, + { + "epoch": 0.9476793248945148, + "grad_norm": 1.1349766254425049, + "learning_rate": 9.824358354443286e-05, + "loss": 0.6433083415031433, + "step": 2246 + }, + { + "epoch": 0.9485232067510548, + "grad_norm": 1.0879144668579102, + "learning_rate": 9.823726881982438e-05, + "loss": 0.6519861817359924, + "step": 2248 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.2289162874221802, + "learning_rate": 9.82309429678044e-05, + "loss": 0.7280195355415344, + "step": 2250 + }, + { + "epoch": 0.950210970464135, + "grad_norm": 1.1755765676498413, + "learning_rate": 9.822460598983217e-05, + "loss": 0.7524687647819519, + "step": 2252 + }, + { + "epoch": 0.9510548523206751, + "grad_norm": 1.179807186126709, + "learning_rate": 9.821825788736949e-05, + "loss": 0.7543174624443054, + "step": 2254 + }, + { + "epoch": 0.9518987341772152, + "grad_norm": 1.1234289407730103, + "learning_rate": 9.821189866188079e-05, + "loss": 0.716377854347229, + "step": 2256 + }, + { + "epoch": 0.9527426160337553, + "grad_norm": 1.0324063301086426, + "learning_rate": 9.820552831483297e-05, + "loss": 0.6403332948684692, + "step": 2258 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 1.1459579467773438, + "learning_rate": 9.819914684769558e-05, + "loss": 0.7406947612762451, + "step": 2260 + }, + { + "epoch": 0.9544303797468354, + "grad_norm": 1.2886124849319458, + "learning_rate": 9.819275426194072e-05, + "loss": 0.749687671661377, + "step": 2262 + }, + { + "epoch": 0.9552742616033755, + "grad_norm": 1.3349844217300415, + "learning_rate": 9.818635055904299e-05, + "loss": 0.778410017490387, + "step": 2264 + }, + { + "epoch": 0.9561181434599156, + "grad_norm": 1.0994901657104492, + "learning_rate": 9.81799357404796e-05, + "loss": 0.6701914668083191, + "step": 2266 + }, + { + "epoch": 0.9569620253164557, + "grad_norm": 1.1787796020507812, + "learning_rate": 9.817350980773038e-05, + "loss": 0.7205135226249695, + "step": 2268 + }, + { + "epoch": 0.9578059071729957, + "grad_norm": 1.100813627243042, + "learning_rate": 9.816707276227763e-05, + "loss": 0.6897916197776794, + "step": 2270 + }, + { + "epoch": 0.9586497890295359, + "grad_norm": 1.1280698776245117, + "learning_rate": 9.816062460560627e-05, + "loss": 0.6763570308685303, + "step": 2272 + }, + { + "epoch": 0.959493670886076, + "grad_norm": 1.2322514057159424, + "learning_rate": 9.815416533920374e-05, + "loss": 0.6948683857917786, + "step": 2274 + }, + { + "epoch": 0.960337552742616, + "grad_norm": 1.3963630199432373, + "learning_rate": 9.814769496456008e-05, + "loss": 0.7876828908920288, + "step": 2276 + }, + { + "epoch": 0.9611814345991562, + "grad_norm": 1.2093676328659058, + "learning_rate": 9.814121348316792e-05, + "loss": 0.8191362619400024, + "step": 2278 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 1.2223572731018066, + "learning_rate": 9.813472089652233e-05, + "loss": 0.7162626385688782, + "step": 2280 + }, + { + "epoch": 0.9628691983122363, + "grad_norm": 1.1498078107833862, + "learning_rate": 9.812821720612111e-05, + "loss": 0.7183970212936401, + "step": 2282 + }, + { + "epoch": 0.9637130801687763, + "grad_norm": 1.1563853025436401, + "learning_rate": 9.812170241346449e-05, + "loss": 0.734487771987915, + "step": 2284 + }, + { + "epoch": 0.9645569620253165, + "grad_norm": 1.1823415756225586, + "learning_rate": 9.81151765200553e-05, + "loss": 0.7312371730804443, + "step": 2286 + }, + { + "epoch": 0.9654008438818565, + "grad_norm": 1.1336151361465454, + "learning_rate": 9.810863952739899e-05, + "loss": 0.7668377757072449, + "step": 2288 + }, + { + "epoch": 0.9662447257383966, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.810209143700347e-05, + "loss": 0.7100399732589722, + "step": 2290 + }, + { + "epoch": 0.9670886075949368, + "grad_norm": 1.1368129253387451, + "learning_rate": 9.809553225037926e-05, + "loss": 0.7169836163520813, + "step": 2292 + }, + { + "epoch": 0.9679324894514768, + "grad_norm": 1.141107439994812, + "learning_rate": 9.808896196903947e-05, + "loss": 0.7709535956382751, + "step": 2294 + }, + { + "epoch": 0.9687763713080169, + "grad_norm": 1.276405930519104, + "learning_rate": 9.808238059449971e-05, + "loss": 0.7300511002540588, + "step": 2296 + }, + { + "epoch": 0.9696202531645569, + "grad_norm": 0.9817046523094177, + "learning_rate": 9.80757881282782e-05, + "loss": 0.6259129047393799, + "step": 2298 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 1.3965257406234741, + "learning_rate": 9.806918457189566e-05, + "loss": 0.7361716032028198, + "step": 2300 + }, + { + "epoch": 0.9704641350210971, + "eval_loss": 0.7464568614959717, + "eval_runtime": 864.2128, + "eval_samples_per_second": 2.438, + "eval_steps_per_second": 2.438, + "step": 2300 + }, + { + "epoch": 0.9713080168776371, + "grad_norm": 1.2168612480163574, + "learning_rate": 9.806256992687544e-05, + "loss": 0.805477499961853, + "step": 2302 + }, + { + "epoch": 0.9721518987341772, + "grad_norm": 1.0418168306350708, + "learning_rate": 9.80559441947434e-05, + "loss": 0.6673368811607361, + "step": 2304 + }, + { + "epoch": 0.9729957805907173, + "grad_norm": 1.223128318786621, + "learning_rate": 9.804930737702796e-05, + "loss": 0.7585647106170654, + "step": 2306 + }, + { + "epoch": 0.9738396624472574, + "grad_norm": 1.264511227607727, + "learning_rate": 9.804265947526011e-05, + "loss": 0.7642034888267517, + "step": 2308 + }, + { + "epoch": 0.9746835443037974, + "grad_norm": 1.076887607574463, + "learning_rate": 9.803600049097339e-05, + "loss": 0.7094541192054749, + "step": 2310 + }, + { + "epoch": 0.9755274261603376, + "grad_norm": 1.0214987993240356, + "learning_rate": 9.802933042570392e-05, + "loss": 0.7370059490203857, + "step": 2312 + }, + { + "epoch": 0.9763713080168777, + "grad_norm": 1.3075295686721802, + "learning_rate": 9.802264928099035e-05, + "loss": 0.726834237575531, + "step": 2314 + }, + { + "epoch": 0.9772151898734177, + "grad_norm": 1.057386040687561, + "learning_rate": 9.801595705837385e-05, + "loss": 0.6742353439331055, + "step": 2316 + }, + { + "epoch": 0.9780590717299578, + "grad_norm": 1.3998085260391235, + "learning_rate": 9.800925375939825e-05, + "loss": 0.6862425208091736, + "step": 2318 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 1.080574631690979, + "learning_rate": 9.800253938560983e-05, + "loss": 0.6212031245231628, + "step": 2320 + }, + { + "epoch": 0.979746835443038, + "grad_norm": 1.3643771409988403, + "learning_rate": 9.799581393855748e-05, + "loss": 0.7522522211074829, + "step": 2322 + }, + { + "epoch": 0.980590717299578, + "grad_norm": 1.2455768585205078, + "learning_rate": 9.798907741979264e-05, + "loss": 0.7265716791152954, + "step": 2324 + }, + { + "epoch": 0.9814345991561182, + "grad_norm": 1.078774333000183, + "learning_rate": 9.798232983086927e-05, + "loss": 0.7160419225692749, + "step": 2326 + }, + { + "epoch": 0.9822784810126582, + "grad_norm": 1.3013948202133179, + "learning_rate": 9.797557117334394e-05, + "loss": 0.7991124391555786, + "step": 2328 + }, + { + "epoch": 0.9831223628691983, + "grad_norm": 1.2216732501983643, + "learning_rate": 9.796880144877572e-05, + "loss": 0.7193916440010071, + "step": 2330 + }, + { + "epoch": 0.9839662447257383, + "grad_norm": 1.1469542980194092, + "learning_rate": 9.796202065872627e-05, + "loss": 0.7184370756149292, + "step": 2332 + }, + { + "epoch": 0.9848101265822785, + "grad_norm": 1.0431830883026123, + "learning_rate": 9.795522880475979e-05, + "loss": 0.6474619507789612, + "step": 2334 + }, + { + "epoch": 0.9856540084388186, + "grad_norm": 1.1819576025009155, + "learning_rate": 9.794842588844299e-05, + "loss": 0.6392545700073242, + "step": 2336 + }, + { + "epoch": 0.9864978902953586, + "grad_norm": 1.1984983682632446, + "learning_rate": 9.794161191134525e-05, + "loss": 0.7358114719390869, + "step": 2338 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 1.3378512859344482, + "learning_rate": 9.793478687503834e-05, + "loss": 0.6762020587921143, + "step": 2340 + }, + { + "epoch": 0.9881856540084388, + "grad_norm": 1.272674560546875, + "learning_rate": 9.792795078109673e-05, + "loss": 0.7478934526443481, + "step": 2342 + }, + { + "epoch": 0.9890295358649789, + "grad_norm": 1.153746247291565, + "learning_rate": 9.792110363109733e-05, + "loss": 0.7316533923149109, + "step": 2344 + }, + { + "epoch": 0.9898734177215189, + "grad_norm": 1.1361702680587769, + "learning_rate": 9.791424542661967e-05, + "loss": 0.7078539133071899, + "step": 2346 + }, + { + "epoch": 0.9907172995780591, + "grad_norm": 1.3043115139007568, + "learning_rate": 9.790737616924581e-05, + "loss": 0.7945935130119324, + "step": 2348 + }, + { + "epoch": 0.9915611814345991, + "grad_norm": 1.1913264989852905, + "learning_rate": 9.790049586056034e-05, + "loss": 0.8247197866439819, + "step": 2350 + }, + { + "epoch": 0.9924050632911392, + "grad_norm": 1.1560171842575073, + "learning_rate": 9.789360450215041e-05, + "loss": 0.7099657654762268, + "step": 2352 + }, + { + "epoch": 0.9932489451476794, + "grad_norm": 1.2311041355133057, + "learning_rate": 9.788670209560575e-05, + "loss": 0.7480318546295166, + "step": 2354 + }, + { + "epoch": 0.9940928270042194, + "grad_norm": 1.1584707498550415, + "learning_rate": 9.787978864251859e-05, + "loss": 0.6870889067649841, + "step": 2356 + }, + { + "epoch": 0.9949367088607595, + "grad_norm": 1.057478666305542, + "learning_rate": 9.787286414448375e-05, + "loss": 0.6114922165870667, + "step": 2358 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 1.1431775093078613, + "learning_rate": 9.786592860309856e-05, + "loss": 0.6955118179321289, + "step": 2360 + }, + { + "epoch": 0.9966244725738397, + "grad_norm": 1.232142448425293, + "learning_rate": 9.785898201996292e-05, + "loss": 0.735048770904541, + "step": 2362 + }, + { + "epoch": 0.9974683544303797, + "grad_norm": 1.1236306428909302, + "learning_rate": 9.785202439667928e-05, + "loss": 0.7150241136550903, + "step": 2364 + }, + { + "epoch": 0.9983122362869198, + "grad_norm": 1.0517534017562866, + "learning_rate": 9.784505573485263e-05, + "loss": 0.6870222687721252, + "step": 2366 + }, + { + "epoch": 0.99915611814346, + "grad_norm": 1.1747480630874634, + "learning_rate": 9.78380760360905e-05, + "loss": 0.7521567940711975, + "step": 2368 + }, + { + "epoch": 1.0, + "grad_norm": 1.2790346145629883, + "learning_rate": 9.783108530200298e-05, + "loss": 0.7336234450340271, + "step": 2370 + }, + { + "epoch": 1.0008438818565402, + "grad_norm": 1.1216399669647217, + "learning_rate": 9.78240835342027e-05, + "loss": 0.6378109455108643, + "step": 2372 + }, + { + "epoch": 1.00168776371308, + "grad_norm": 1.267336368560791, + "learning_rate": 9.781707073430482e-05, + "loss": 0.6174905300140381, + "step": 2374 + }, + { + "epoch": 1.0025316455696203, + "grad_norm": 1.1342934370040894, + "learning_rate": 9.781004690392706e-05, + "loss": 0.6579123139381409, + "step": 2376 + }, + { + "epoch": 1.0033755274261604, + "grad_norm": 1.1317468881607056, + "learning_rate": 9.78030120446897e-05, + "loss": 0.6679617166519165, + "step": 2378 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 1.2992616891860962, + "learning_rate": 9.779596615821552e-05, + "loss": 0.7368149161338806, + "step": 2380 + }, + { + "epoch": 1.0050632911392405, + "grad_norm": 1.1714510917663574, + "learning_rate": 9.77889092461299e-05, + "loss": 0.6887164115905762, + "step": 2382 + }, + { + "epoch": 1.0059071729957807, + "grad_norm": 1.1670639514923096, + "learning_rate": 9.778184131006071e-05, + "loss": 0.681344211101532, + "step": 2384 + }, + { + "epoch": 1.0067510548523206, + "grad_norm": 1.2487291097640991, + "learning_rate": 9.77747623516384e-05, + "loss": 0.7342769503593445, + "step": 2386 + }, + { + "epoch": 1.0075949367088608, + "grad_norm": 1.2408956289291382, + "learning_rate": 9.776767237249595e-05, + "loss": 0.577454149723053, + "step": 2388 + }, + { + "epoch": 1.0084388185654007, + "grad_norm": 1.067991852760315, + "learning_rate": 9.776057137426889e-05, + "loss": 0.6588307023048401, + "step": 2390 + }, + { + "epoch": 1.009282700421941, + "grad_norm": 1.2821543216705322, + "learning_rate": 9.775345935859525e-05, + "loss": 0.7045041918754578, + "step": 2392 + }, + { + "epoch": 1.010126582278481, + "grad_norm": 1.3160134553909302, + "learning_rate": 9.774633632711569e-05, + "loss": 0.7141479253768921, + "step": 2394 + }, + { + "epoch": 1.010970464135021, + "grad_norm": 1.66774320602417, + "learning_rate": 9.773920228147329e-05, + "loss": 0.723293662071228, + "step": 2396 + }, + { + "epoch": 1.0118143459915612, + "grad_norm": 1.027588963508606, + "learning_rate": 9.77320572233138e-05, + "loss": 0.5812023878097534, + "step": 2398 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.406507968902588, + "learning_rate": 9.77249011542854e-05, + "loss": 0.7071458101272583, + "step": 2400 + }, + { + "epoch": 1.0126582278481013, + "eval_loss": 0.7421699166297913, + "eval_runtime": 854.2185, + "eval_samples_per_second": 2.467, + "eval_steps_per_second": 2.467, + "step": 2400 + }, + { + "epoch": 1.0135021097046413, + "grad_norm": 1.1236240863800049, + "learning_rate": 9.771773407603889e-05, + "loss": 0.7049722671508789, + "step": 2402 + }, + { + "epoch": 1.0143459915611814, + "grad_norm": 1.1924289464950562, + "learning_rate": 9.771055599022756e-05, + "loss": 0.635308027267456, + "step": 2404 + }, + { + "epoch": 1.0151898734177216, + "grad_norm": 1.1744966506958008, + "learning_rate": 9.770336689850727e-05, + "loss": 0.7286487817764282, + "step": 2406 + }, + { + "epoch": 1.0160337552742615, + "grad_norm": 1.2131173610687256, + "learning_rate": 9.769616680253639e-05, + "loss": 0.6828222274780273, + "step": 2408 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 1.0517828464508057, + "learning_rate": 9.768895570397585e-05, + "loss": 0.6652156114578247, + "step": 2410 + }, + { + "epoch": 1.0177215189873419, + "grad_norm": 1.1603758335113525, + "learning_rate": 9.768173360448912e-05, + "loss": 0.7278267741203308, + "step": 2412 + }, + { + "epoch": 1.0185654008438818, + "grad_norm": 1.3167752027511597, + "learning_rate": 9.767450050574218e-05, + "loss": 0.6082334518432617, + "step": 2414 + }, + { + "epoch": 1.019409282700422, + "grad_norm": 1.1754449605941772, + "learning_rate": 9.766725640940358e-05, + "loss": 0.67228102684021, + "step": 2416 + }, + { + "epoch": 1.0202531645569621, + "grad_norm": 1.060952067375183, + "learning_rate": 9.766000131714442e-05, + "loss": 0.5984366536140442, + "step": 2418 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 1.0826152563095093, + "learning_rate": 9.765273523063825e-05, + "loss": 0.690661609172821, + "step": 2420 + }, + { + "epoch": 1.0219409282700422, + "grad_norm": 1.423723816871643, + "learning_rate": 9.764545815156125e-05, + "loss": 0.7960668802261353, + "step": 2422 + }, + { + "epoch": 1.0227848101265822, + "grad_norm": 1.0882549285888672, + "learning_rate": 9.763817008159212e-05, + "loss": 0.6971074342727661, + "step": 2424 + }, + { + "epoch": 1.0236286919831223, + "grad_norm": 1.1053040027618408, + "learning_rate": 9.763087102241206e-05, + "loss": 0.6854458451271057, + "step": 2426 + }, + { + "epoch": 1.0244725738396625, + "grad_norm": 1.1975224018096924, + "learning_rate": 9.762356097570482e-05, + "loss": 0.6724489331245422, + "step": 2428 + }, + { + "epoch": 1.0253164556962024, + "grad_norm": 1.1692171096801758, + "learning_rate": 9.76162399431567e-05, + "loss": 0.7064506411552429, + "step": 2430 + }, + { + "epoch": 1.0261603375527426, + "grad_norm": 1.1927787065505981, + "learning_rate": 9.760890792645649e-05, + "loss": 0.6605257391929626, + "step": 2432 + }, + { + "epoch": 1.0270042194092828, + "grad_norm": 1.4147427082061768, + "learning_rate": 9.760156492729558e-05, + "loss": 0.6872501373291016, + "step": 2434 + }, + { + "epoch": 1.0278481012658227, + "grad_norm": 1.2503126859664917, + "learning_rate": 9.759421094736785e-05, + "loss": 0.7117500305175781, + "step": 2436 + }, + { + "epoch": 1.0286919831223629, + "grad_norm": 1.229978084564209, + "learning_rate": 9.758684598836971e-05, + "loss": 0.6740369200706482, + "step": 2438 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 1.4765945672988892, + "learning_rate": 9.757947005200014e-05, + "loss": 0.7215790748596191, + "step": 2440 + }, + { + "epoch": 1.030379746835443, + "grad_norm": 1.282632827758789, + "learning_rate": 9.757208313996061e-05, + "loss": 0.6961746215820312, + "step": 2442 + }, + { + "epoch": 1.0312236286919831, + "grad_norm": 1.259828805923462, + "learning_rate": 9.756468525395512e-05, + "loss": 0.6348349452018738, + "step": 2444 + }, + { + "epoch": 1.0320675105485233, + "grad_norm": 1.0984172821044922, + "learning_rate": 9.755727639569024e-05, + "loss": 0.6756057739257812, + "step": 2446 + }, + { + "epoch": 1.0329113924050632, + "grad_norm": 1.235835075378418, + "learning_rate": 9.754985656687506e-05, + "loss": 0.6968509554862976, + "step": 2448 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 1.273032546043396, + "learning_rate": 9.754242576922119e-05, + "loss": 0.6793950796127319, + "step": 2450 + }, + { + "epoch": 1.0345991561181433, + "grad_norm": 1.251996397972107, + "learning_rate": 9.753498400444274e-05, + "loss": 0.645270586013794, + "step": 2452 + }, + { + "epoch": 1.0354430379746835, + "grad_norm": 1.4310805797576904, + "learning_rate": 9.752753127425642e-05, + "loss": 0.7291322350502014, + "step": 2454 + }, + { + "epoch": 1.0362869198312237, + "grad_norm": 1.6582196950912476, + "learning_rate": 9.752006758038142e-05, + "loss": 0.7553019523620605, + "step": 2456 + }, + { + "epoch": 1.0371308016877636, + "grad_norm": 1.081773042678833, + "learning_rate": 9.751259292453947e-05, + "loss": 0.5637331008911133, + "step": 2458 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 1.1483876705169678, + "learning_rate": 9.750510730845483e-05, + "loss": 0.6012396216392517, + "step": 2460 + }, + { + "epoch": 1.038818565400844, + "grad_norm": 1.0879185199737549, + "learning_rate": 9.749761073385428e-05, + "loss": 0.6795822381973267, + "step": 2462 + }, + { + "epoch": 1.0396624472573839, + "grad_norm": 1.2378218173980713, + "learning_rate": 9.749010320246714e-05, + "loss": 0.6895145773887634, + "step": 2464 + }, + { + "epoch": 1.040506329113924, + "grad_norm": 1.253233790397644, + "learning_rate": 9.748258471602527e-05, + "loss": 0.7124115228652954, + "step": 2466 + }, + { + "epoch": 1.0413502109704642, + "grad_norm": 1.3994864225387573, + "learning_rate": 9.747505527626302e-05, + "loss": 0.7304861545562744, + "step": 2468 + }, + { + "epoch": 1.0421940928270041, + "grad_norm": 1.2360669374465942, + "learning_rate": 9.74675148849173e-05, + "loss": 0.6845837831497192, + "step": 2470 + }, + { + "epoch": 1.0430379746835443, + "grad_norm": 1.126849889755249, + "learning_rate": 9.74599635437275e-05, + "loss": 0.6780203580856323, + "step": 2472 + }, + { + "epoch": 1.0438818565400845, + "grad_norm": 1.169788122177124, + "learning_rate": 9.745240125443562e-05, + "loss": 0.7550003528594971, + "step": 2474 + }, + { + "epoch": 1.0447257383966244, + "grad_norm": 1.1311867237091064, + "learning_rate": 9.744482801878612e-05, + "loss": 0.6910399198532104, + "step": 2476 + }, + { + "epoch": 1.0455696202531646, + "grad_norm": 1.1267731189727783, + "learning_rate": 9.743724383852597e-05, + "loss": 0.7164814472198486, + "step": 2478 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 1.2239704132080078, + "learning_rate": 9.742964871540472e-05, + "loss": 0.6428439617156982, + "step": 2480 + }, + { + "epoch": 1.0472573839662447, + "grad_norm": 1.1854743957519531, + "learning_rate": 9.742204265117443e-05, + "loss": 0.6994290351867676, + "step": 2482 + }, + { + "epoch": 1.0481012658227848, + "grad_norm": 1.0695894956588745, + "learning_rate": 9.741442564758964e-05, + "loss": 0.6725777983665466, + "step": 2484 + }, + { + "epoch": 1.048945147679325, + "grad_norm": 1.1799863576889038, + "learning_rate": 9.740679770640748e-05, + "loss": 0.6538674235343933, + "step": 2486 + }, + { + "epoch": 1.049789029535865, + "grad_norm": 1.295546293258667, + "learning_rate": 9.739915882938754e-05, + "loss": 0.780756950378418, + "step": 2488 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 1.2371755838394165, + "learning_rate": 9.739150901829198e-05, + "loss": 0.6657930612564087, + "step": 2490 + }, + { + "epoch": 1.051476793248945, + "grad_norm": 1.103037714958191, + "learning_rate": 9.738384827488547e-05, + "loss": 0.6675208210945129, + "step": 2492 + }, + { + "epoch": 1.0523206751054852, + "grad_norm": 1.1835435628890991, + "learning_rate": 9.737617660093517e-05, + "loss": 0.6693358421325684, + "step": 2494 + }, + { + "epoch": 1.0531645569620254, + "grad_norm": 1.003771424293518, + "learning_rate": 9.736849399821082e-05, + "loss": 0.624502956867218, + "step": 2496 + }, + { + "epoch": 1.0540084388185653, + "grad_norm": 1.1391769647598267, + "learning_rate": 9.736080046848463e-05, + "loss": 0.6350868344306946, + "step": 2498 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 1.376518726348877, + "learning_rate": 9.735309601353134e-05, + "loss": 0.6721012592315674, + "step": 2500 + }, + { + "epoch": 1.0548523206751055, + "eval_loss": 0.741338849067688, + "eval_runtime": 847.7478, + "eval_samples_per_second": 2.485, + "eval_steps_per_second": 2.485, + "step": 2500 + }, + { + "epoch": 1.0556962025316456, + "grad_norm": 1.194190502166748, + "learning_rate": 9.734538063512824e-05, + "loss": 0.6888233423233032, + "step": 2502 + }, + { + "epoch": 1.0565400843881856, + "grad_norm": 1.378830909729004, + "learning_rate": 9.733765433505513e-05, + "loss": 0.7095553278923035, + "step": 2504 + }, + { + "epoch": 1.0573839662447257, + "grad_norm": 1.1289541721343994, + "learning_rate": 9.732991711509428e-05, + "loss": 0.6734166145324707, + "step": 2506 + }, + { + "epoch": 1.058227848101266, + "grad_norm": 1.1858116388320923, + "learning_rate": 9.732216897703054e-05, + "loss": 0.7006195187568665, + "step": 2508 + }, + { + "epoch": 1.0590717299578059, + "grad_norm": 1.1365686655044556, + "learning_rate": 9.731440992265127e-05, + "loss": 0.6481205821037292, + "step": 2510 + }, + { + "epoch": 1.059915611814346, + "grad_norm": 1.2886228561401367, + "learning_rate": 9.730663995374632e-05, + "loss": 0.679282546043396, + "step": 2512 + }, + { + "epoch": 1.0607594936708862, + "grad_norm": 1.355322003364563, + "learning_rate": 9.729885907210808e-05, + "loss": 0.7656359672546387, + "step": 2514 + }, + { + "epoch": 1.0616033755274261, + "grad_norm": 1.1552364826202393, + "learning_rate": 9.729106727953142e-05, + "loss": 0.5996183156967163, + "step": 2516 + }, + { + "epoch": 1.0624472573839663, + "grad_norm": 1.1419235467910767, + "learning_rate": 9.728326457781381e-05, + "loss": 0.7599716782569885, + "step": 2518 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 1.2240079641342163, + "learning_rate": 9.727545096875512e-05, + "loss": 0.7150241732597351, + "step": 2520 + }, + { + "epoch": 1.0641350210970464, + "grad_norm": 1.2463440895080566, + "learning_rate": 9.726762645415785e-05, + "loss": 0.734352171421051, + "step": 2522 + }, + { + "epoch": 1.0649789029535865, + "grad_norm": 1.1680364608764648, + "learning_rate": 9.725979103582697e-05, + "loss": 0.6950796842575073, + "step": 2524 + }, + { + "epoch": 1.0658227848101265, + "grad_norm": 1.1680421829223633, + "learning_rate": 9.725194471556991e-05, + "loss": 0.7096341252326965, + "step": 2526 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.043717861175537, + "learning_rate": 9.724408749519671e-05, + "loss": 0.6486304402351379, + "step": 2528 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 1.1240284442901611, + "learning_rate": 9.723621937651985e-05, + "loss": 0.6519505381584167, + "step": 2530 + }, + { + "epoch": 1.0683544303797468, + "grad_norm": 1.185223937034607, + "learning_rate": 9.722834036135439e-05, + "loss": 0.6724293231964111, + "step": 2532 + }, + { + "epoch": 1.069198312236287, + "grad_norm": 1.3234196901321411, + "learning_rate": 9.722045045151784e-05, + "loss": 0.6886576414108276, + "step": 2534 + }, + { + "epoch": 1.070042194092827, + "grad_norm": 1.333084225654602, + "learning_rate": 9.721254964883024e-05, + "loss": 0.688493549823761, + "step": 2536 + }, + { + "epoch": 1.070886075949367, + "grad_norm": 1.2435462474822998, + "learning_rate": 9.720463795511419e-05, + "loss": 0.6527412533760071, + "step": 2538 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 1.1521880626678467, + "learning_rate": 9.719671537219472e-05, + "loss": 0.6508163809776306, + "step": 2540 + }, + { + "epoch": 1.0725738396624473, + "grad_norm": 1.015013575553894, + "learning_rate": 9.718878190189947e-05, + "loss": 0.6954023838043213, + "step": 2542 + }, + { + "epoch": 1.0734177215189873, + "grad_norm": 1.1507678031921387, + "learning_rate": 9.718083754605851e-05, + "loss": 0.7201322913169861, + "step": 2544 + }, + { + "epoch": 1.0742616033755275, + "grad_norm": 1.0569016933441162, + "learning_rate": 9.717288230650444e-05, + "loss": 0.6688649654388428, + "step": 2546 + }, + { + "epoch": 1.0751054852320676, + "grad_norm": 1.2178492546081543, + "learning_rate": 9.716491618507241e-05, + "loss": 0.7077898979187012, + "step": 2548 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.3587230443954468, + "learning_rate": 9.715693918360002e-05, + "loss": 0.7312119603157043, + "step": 2550 + }, + { + "epoch": 1.0767932489451477, + "grad_norm": 1.1930122375488281, + "learning_rate": 9.714895130392744e-05, + "loss": 0.6910589337348938, + "step": 2552 + }, + { + "epoch": 1.0776371308016879, + "grad_norm": 1.2440707683563232, + "learning_rate": 9.71409525478973e-05, + "loss": 0.7942836284637451, + "step": 2554 + }, + { + "epoch": 1.0784810126582278, + "grad_norm": 1.3755065202713013, + "learning_rate": 9.713294291735477e-05, + "loss": 0.6652286052703857, + "step": 2556 + }, + { + "epoch": 1.079324894514768, + "grad_norm": 1.165448784828186, + "learning_rate": 9.71249224141475e-05, + "loss": 0.6025735139846802, + "step": 2558 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 1.2981204986572266, + "learning_rate": 9.711689104012569e-05, + "loss": 0.7343734502792358, + "step": 2560 + }, + { + "epoch": 1.081012658227848, + "grad_norm": 1.2040622234344482, + "learning_rate": 9.710884879714202e-05, + "loss": 0.6903306841850281, + "step": 2562 + }, + { + "epoch": 1.0818565400843883, + "grad_norm": 1.1835904121398926, + "learning_rate": 9.710079568705168e-05, + "loss": 0.69134920835495, + "step": 2564 + }, + { + "epoch": 1.0827004219409282, + "grad_norm": 1.3345229625701904, + "learning_rate": 9.709273171171235e-05, + "loss": 0.6471185088157654, + "step": 2566 + }, + { + "epoch": 1.0835443037974684, + "grad_norm": 1.0884469747543335, + "learning_rate": 9.708465687298425e-05, + "loss": 0.6302382349967957, + "step": 2568 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 1.1994211673736572, + "learning_rate": 9.707657117273007e-05, + "loss": 0.7329678535461426, + "step": 2570 + }, + { + "epoch": 1.0852320675105485, + "grad_norm": 1.2609503269195557, + "learning_rate": 9.706847461281507e-05, + "loss": 0.719862163066864, + "step": 2572 + }, + { + "epoch": 1.0860759493670886, + "grad_norm": 1.2686879634857178, + "learning_rate": 9.706036719510694e-05, + "loss": 0.7142901420593262, + "step": 2574 + }, + { + "epoch": 1.0869198312236288, + "grad_norm": 1.2763310670852661, + "learning_rate": 9.705224892147591e-05, + "loss": 0.7009075284004211, + "step": 2576 + }, + { + "epoch": 1.0877637130801687, + "grad_norm": 1.1704022884368896, + "learning_rate": 9.70441197937947e-05, + "loss": 0.6873779296875, + "step": 2578 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 1.0482875108718872, + "learning_rate": 9.703597981393856e-05, + "loss": 0.6437726020812988, + "step": 2580 + }, + { + "epoch": 1.0894514767932488, + "grad_norm": 1.28431236743927, + "learning_rate": 9.702782898378521e-05, + "loss": 0.6933431625366211, + "step": 2582 + }, + { + "epoch": 1.090295358649789, + "grad_norm": 1.0962283611297607, + "learning_rate": 9.701966730521491e-05, + "loss": 0.6488757133483887, + "step": 2584 + }, + { + "epoch": 1.0911392405063292, + "grad_norm": 1.2177873849868774, + "learning_rate": 9.70114947801104e-05, + "loss": 0.6385396122932434, + "step": 2586 + }, + { + "epoch": 1.091983122362869, + "grad_norm": 1.197059988975525, + "learning_rate": 9.70033114103569e-05, + "loss": 0.6826614737510681, + "step": 2588 + }, + { + "epoch": 1.0928270042194093, + "grad_norm": 1.1624075174331665, + "learning_rate": 9.699511719784217e-05, + "loss": 0.605629563331604, + "step": 2590 + }, + { + "epoch": 1.0936708860759494, + "grad_norm": 1.2975167036056519, + "learning_rate": 9.698691214445648e-05, + "loss": 0.734926700592041, + "step": 2592 + }, + { + "epoch": 1.0945147679324894, + "grad_norm": 1.215414047241211, + "learning_rate": 9.697869625209255e-05, + "loss": 0.7281333804130554, + "step": 2594 + }, + { + "epoch": 1.0953586497890295, + "grad_norm": 1.1862860918045044, + "learning_rate": 9.697046952264563e-05, + "loss": 0.7388250827789307, + "step": 2596 + }, + { + "epoch": 1.0962025316455697, + "grad_norm": 1.1127797365188599, + "learning_rate": 9.696223195801348e-05, + "loss": 0.6495320796966553, + "step": 2598 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 1.0863338708877563, + "learning_rate": 9.695398356009636e-05, + "loss": 0.7157143950462341, + "step": 2600 + }, + { + "epoch": 1.0970464135021096, + "eval_loss": 0.7377332448959351, + "eval_runtime": 859.6612, + "eval_samples_per_second": 2.451, + "eval_steps_per_second": 2.451, + "step": 2600 + }, + { + "epoch": 1.0978902953586498, + "grad_norm": 1.1228652000427246, + "learning_rate": 9.694572433079699e-05, + "loss": 0.6597335934638977, + "step": 2602 + }, + { + "epoch": 1.09873417721519, + "grad_norm": 1.3077653646469116, + "learning_rate": 9.69374542720206e-05, + "loss": 0.6715680360794067, + "step": 2604 + }, + { + "epoch": 1.09957805907173, + "grad_norm": 1.241603970527649, + "learning_rate": 9.692917338567499e-05, + "loss": 0.6910243034362793, + "step": 2606 + }, + { + "epoch": 1.10042194092827, + "grad_norm": 1.1372551918029785, + "learning_rate": 9.692088167367037e-05, + "loss": 0.6519553065299988, + "step": 2608 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 1.2894765138626099, + "learning_rate": 9.691257913791949e-05, + "loss": 0.6542758941650391, + "step": 2610 + }, + { + "epoch": 1.1021097046413502, + "grad_norm": 1.0800915956497192, + "learning_rate": 9.690426578033755e-05, + "loss": 0.6886795163154602, + "step": 2612 + }, + { + "epoch": 1.1029535864978903, + "grad_norm": 1.3394384384155273, + "learning_rate": 9.689594160284233e-05, + "loss": 0.7512150406837463, + "step": 2614 + }, + { + "epoch": 1.1037974683544305, + "grad_norm": 1.2175323963165283, + "learning_rate": 9.688760660735402e-05, + "loss": 0.67207932472229, + "step": 2616 + }, + { + "epoch": 1.1046413502109704, + "grad_norm": 1.2181185483932495, + "learning_rate": 9.687926079579537e-05, + "loss": 0.6591740846633911, + "step": 2618 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 1.1740983724594116, + "learning_rate": 9.68709041700916e-05, + "loss": 0.6431041359901428, + "step": 2620 + }, + { + "epoch": 1.1063291139240505, + "grad_norm": 1.1792434453964233, + "learning_rate": 9.686253673217038e-05, + "loss": 0.6573615074157715, + "step": 2622 + }, + { + "epoch": 1.1071729957805907, + "grad_norm": 1.058391809463501, + "learning_rate": 9.685415848396196e-05, + "loss": 0.5576209425926208, + "step": 2624 + }, + { + "epoch": 1.1080168776371309, + "grad_norm": 1.3203206062316895, + "learning_rate": 9.684576942739903e-05, + "loss": 0.668684184551239, + "step": 2626 + }, + { + "epoch": 1.1088607594936708, + "grad_norm": 1.2391762733459473, + "learning_rate": 9.68373695644168e-05, + "loss": 0.6800089478492737, + "step": 2628 + }, + { + "epoch": 1.109704641350211, + "grad_norm": 1.2323405742645264, + "learning_rate": 9.682895889695292e-05, + "loss": 0.6433757543563843, + "step": 2630 + }, + { + "epoch": 1.1105485232067511, + "grad_norm": 1.2656551599502563, + "learning_rate": 9.682053742694759e-05, + "loss": 0.6628785729408264, + "step": 2632 + }, + { + "epoch": 1.111392405063291, + "grad_norm": 1.2984392642974854, + "learning_rate": 9.681210515634349e-05, + "loss": 0.6838971972465515, + "step": 2634 + }, + { + "epoch": 1.1122362869198312, + "grad_norm": 1.3200393915176392, + "learning_rate": 9.680366208708576e-05, + "loss": 0.7548647522926331, + "step": 2636 + }, + { + "epoch": 1.1130801687763714, + "grad_norm": 1.225388526916504, + "learning_rate": 9.679520822112208e-05, + "loss": 0.6553335189819336, + "step": 2638 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 1.2350653409957886, + "learning_rate": 9.678674356040259e-05, + "loss": 0.631401538848877, + "step": 2640 + }, + { + "epoch": 1.1147679324894515, + "grad_norm": 1.2325507402420044, + "learning_rate": 9.677826810687989e-05, + "loss": 0.6459156274795532, + "step": 2642 + }, + { + "epoch": 1.1156118143459917, + "grad_norm": 1.0008996725082397, + "learning_rate": 9.676978186250915e-05, + "loss": 0.6425284743309021, + "step": 2644 + }, + { + "epoch": 1.1164556962025316, + "grad_norm": 1.3767247200012207, + "learning_rate": 9.676128482924796e-05, + "loss": 0.6451422572135925, + "step": 2646 + }, + { + "epoch": 1.1172995780590718, + "grad_norm": 1.2070895433425903, + "learning_rate": 9.675277700905643e-05, + "loss": 0.6713272929191589, + "step": 2648 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 1.1582069396972656, + "learning_rate": 9.674425840389716e-05, + "loss": 0.6285044550895691, + "step": 2650 + }, + { + "epoch": 1.1189873417721519, + "grad_norm": 1.1641311645507812, + "learning_rate": 9.67357290157352e-05, + "loss": 0.624229907989502, + "step": 2652 + }, + { + "epoch": 1.119831223628692, + "grad_norm": 1.3071147203445435, + "learning_rate": 9.672718884653814e-05, + "loss": 0.7214919328689575, + "step": 2654 + }, + { + "epoch": 1.120675105485232, + "grad_norm": 1.2157800197601318, + "learning_rate": 9.671863789827602e-05, + "loss": 0.8062215447425842, + "step": 2656 + }, + { + "epoch": 1.1215189873417721, + "grad_norm": 1.2843927145004272, + "learning_rate": 9.671007617292138e-05, + "loss": 0.6362426280975342, + "step": 2658 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 1.1182712316513062, + "learning_rate": 9.670150367244927e-05, + "loss": 0.6181318163871765, + "step": 2660 + }, + { + "epoch": 1.1232067510548522, + "grad_norm": 1.566605806350708, + "learning_rate": 9.669292039883717e-05, + "loss": 0.6973897218704224, + "step": 2662 + }, + { + "epoch": 1.1240506329113924, + "grad_norm": 1.0726850032806396, + "learning_rate": 9.66843263540651e-05, + "loss": 0.6117324829101562, + "step": 2664 + }, + { + "epoch": 1.1248945147679326, + "grad_norm": 1.2953020334243774, + "learning_rate": 9.66757215401155e-05, + "loss": 0.642676830291748, + "step": 2666 + }, + { + "epoch": 1.1257383966244725, + "grad_norm": 1.1184383630752563, + "learning_rate": 9.66671059589734e-05, + "loss": 0.6757452487945557, + "step": 2668 + }, + { + "epoch": 1.1265822784810127, + "grad_norm": 1.2732970714569092, + "learning_rate": 9.66584796126262e-05, + "loss": 0.6861951947212219, + "step": 2670 + }, + { + "epoch": 1.1274261603375528, + "grad_norm": 1.2713000774383545, + "learning_rate": 9.664984250306383e-05, + "loss": 0.6727077960968018, + "step": 2672 + }, + { + "epoch": 1.1282700421940928, + "grad_norm": 1.269827961921692, + "learning_rate": 9.664119463227874e-05, + "loss": 0.7355974912643433, + "step": 2674 + }, + { + "epoch": 1.129113924050633, + "grad_norm": 1.3067172765731812, + "learning_rate": 9.663253600226581e-05, + "loss": 0.7121313214302063, + "step": 2676 + }, + { + "epoch": 1.129957805907173, + "grad_norm": 1.2958797216415405, + "learning_rate": 9.662386661502242e-05, + "loss": 0.6671369075775146, + "step": 2678 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 1.2943401336669922, + "learning_rate": 9.661518647254842e-05, + "loss": 0.6153768301010132, + "step": 2680 + }, + { + "epoch": 1.1316455696202532, + "grad_norm": 1.1744167804718018, + "learning_rate": 9.660649557684616e-05, + "loss": 0.6070778965950012, + "step": 2682 + }, + { + "epoch": 1.1324894514767934, + "grad_norm": 1.159209132194519, + "learning_rate": 9.659779392992047e-05, + "loss": 0.676887035369873, + "step": 2684 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.1937510967254639, + "learning_rate": 9.658908153377866e-05, + "loss": 0.6086745262145996, + "step": 2686 + }, + { + "epoch": 1.1341772151898735, + "grad_norm": 1.1461687088012695, + "learning_rate": 9.658035839043049e-05, + "loss": 0.6493708491325378, + "step": 2688 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 2.066361665725708, + "learning_rate": 9.657162450188824e-05, + "loss": 0.6813004016876221, + "step": 2690 + }, + { + "epoch": 1.1358649789029536, + "grad_norm": 1.086910367012024, + "learning_rate": 9.656287987016664e-05, + "loss": 0.721062183380127, + "step": 2692 + }, + { + "epoch": 1.1367088607594937, + "grad_norm": 1.1869292259216309, + "learning_rate": 9.65541244972829e-05, + "loss": 0.5975021123886108, + "step": 2694 + }, + { + "epoch": 1.1375527426160337, + "grad_norm": 1.2456518411636353, + "learning_rate": 9.654535838525674e-05, + "loss": 0.6818324327468872, + "step": 2696 + }, + { + "epoch": 1.1383966244725738, + "grad_norm": 1.5271464586257935, + "learning_rate": 9.653658153611031e-05, + "loss": 0.6844469308853149, + "step": 2698 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.1403794288635254, + "learning_rate": 9.652779395186827e-05, + "loss": 0.6388684511184692, + "step": 2700 + }, + { + "epoch": 1.139240506329114, + "eval_loss": 0.7335711717605591, + "eval_runtime": 861.9651, + "eval_samples_per_second": 2.444, + "eval_steps_per_second": 2.444, + "step": 2700 + }, + { + "epoch": 1.140084388185654, + "grad_norm": 1.1091634035110474, + "learning_rate": 9.651899563455775e-05, + "loss": 0.6154619455337524, + "step": 2702 + }, + { + "epoch": 1.140928270042194, + "grad_norm": 1.3280601501464844, + "learning_rate": 9.651018658620837e-05, + "loss": 0.629319429397583, + "step": 2704 + }, + { + "epoch": 1.1417721518987343, + "grad_norm": 1.226806402206421, + "learning_rate": 9.650136680885216e-05, + "loss": 0.6088175773620605, + "step": 2706 + }, + { + "epoch": 1.1426160337552742, + "grad_norm": 1.0593408346176147, + "learning_rate": 9.649253630452372e-05, + "loss": 0.6199659705162048, + "step": 2708 + }, + { + "epoch": 1.1434599156118144, + "grad_norm": 1.1112475395202637, + "learning_rate": 9.648369507526008e-05, + "loss": 0.7233364582061768, + "step": 2710 + }, + { + "epoch": 1.1443037974683543, + "grad_norm": 1.1737885475158691, + "learning_rate": 9.647484312310068e-05, + "loss": 0.6687955856323242, + "step": 2712 + }, + { + "epoch": 1.1451476793248945, + "grad_norm": 1.194532036781311, + "learning_rate": 9.646598045008756e-05, + "loss": 0.6508969068527222, + "step": 2714 + }, + { + "epoch": 1.1459915611814346, + "grad_norm": 1.069395899772644, + "learning_rate": 9.645710705826517e-05, + "loss": 0.6408317685127258, + "step": 2716 + }, + { + "epoch": 1.1468354430379746, + "grad_norm": 1.2429133653640747, + "learning_rate": 9.644822294968037e-05, + "loss": 0.650763750076294, + "step": 2718 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 1.2950133085250854, + "learning_rate": 9.64393281263826e-05, + "loss": 0.6952191591262817, + "step": 2720 + }, + { + "epoch": 1.148523206751055, + "grad_norm": 1.1972628831863403, + "learning_rate": 9.643042259042372e-05, + "loss": 0.6772956252098083, + "step": 2722 + }, + { + "epoch": 1.1493670886075948, + "grad_norm": 1.1670407056808472, + "learning_rate": 9.642150634385805e-05, + "loss": 0.6734447479248047, + "step": 2724 + }, + { + "epoch": 1.150210970464135, + "grad_norm": 1.120302677154541, + "learning_rate": 9.641257938874243e-05, + "loss": 0.6387717127799988, + "step": 2726 + }, + { + "epoch": 1.1510548523206752, + "grad_norm": 1.1241344213485718, + "learning_rate": 9.640364172713609e-05, + "loss": 0.6592874526977539, + "step": 2728 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 1.2627261877059937, + "learning_rate": 9.639469336110083e-05, + "loss": 0.7257466912269592, + "step": 2730 + }, + { + "epoch": 1.1527426160337553, + "grad_norm": 1.0528618097305298, + "learning_rate": 9.638573429270083e-05, + "loss": 0.572188138961792, + "step": 2732 + }, + { + "epoch": 1.1535864978902954, + "grad_norm": 1.212536334991455, + "learning_rate": 9.637676452400277e-05, + "loss": 0.678981602191925, + "step": 2734 + }, + { + "epoch": 1.1544303797468354, + "grad_norm": 1.152167797088623, + "learning_rate": 9.636778405707582e-05, + "loss": 0.6375001072883606, + "step": 2736 + }, + { + "epoch": 1.1552742616033755, + "grad_norm": 1.2400429248809814, + "learning_rate": 9.635879289399161e-05, + "loss": 0.7602289319038391, + "step": 2738 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 1.3488622903823853, + "learning_rate": 9.634979103682421e-05, + "loss": 0.6209543943405151, + "step": 2740 + }, + { + "epoch": 1.1569620253164556, + "grad_norm": 1.1999555826187134, + "learning_rate": 9.634077848765019e-05, + "loss": 0.6215830445289612, + "step": 2742 + }, + { + "epoch": 1.1578059071729958, + "grad_norm": 1.2008578777313232, + "learning_rate": 9.633175524854855e-05, + "loss": 0.6634654998779297, + "step": 2744 + }, + { + "epoch": 1.158649789029536, + "grad_norm": 1.3920676708221436, + "learning_rate": 9.63227213216008e-05, + "loss": 0.7515161633491516, + "step": 2746 + }, + { + "epoch": 1.159493670886076, + "grad_norm": 1.0551656484603882, + "learning_rate": 9.631367670889089e-05, + "loss": 0.724361777305603, + "step": 2748 + }, + { + "epoch": 1.160337552742616, + "grad_norm": 1.2820028066635132, + "learning_rate": 9.630462141250523e-05, + "loss": 0.6673553586006165, + "step": 2750 + }, + { + "epoch": 1.1611814345991562, + "grad_norm": 1.1452983617782593, + "learning_rate": 9.62955554345327e-05, + "loss": 0.7029784917831421, + "step": 2752 + }, + { + "epoch": 1.1620253164556962, + "grad_norm": 1.1808624267578125, + "learning_rate": 9.628647877706466e-05, + "loss": 0.7355457544326782, + "step": 2754 + }, + { + "epoch": 1.1628691983122363, + "grad_norm": 1.0574703216552734, + "learning_rate": 9.627739144219492e-05, + "loss": 0.6144933700561523, + "step": 2756 + }, + { + "epoch": 1.1637130801687763, + "grad_norm": 1.215733528137207, + "learning_rate": 9.626829343201974e-05, + "loss": 0.6843759417533875, + "step": 2758 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 1.1667706966400146, + "learning_rate": 9.625918474863787e-05, + "loss": 0.6197049617767334, + "step": 2760 + }, + { + "epoch": 1.1654008438818566, + "grad_norm": 1.3765631914138794, + "learning_rate": 9.62500653941505e-05, + "loss": 0.715958297252655, + "step": 2762 + }, + { + "epoch": 1.1662447257383965, + "grad_norm": 1.173715591430664, + "learning_rate": 9.62409353706613e-05, + "loss": 0.7433139085769653, + "step": 2764 + }, + { + "epoch": 1.1670886075949367, + "grad_norm": 1.1837430000305176, + "learning_rate": 9.623179468027637e-05, + "loss": 0.7174371480941772, + "step": 2766 + }, + { + "epoch": 1.1679324894514769, + "grad_norm": 1.1577154397964478, + "learning_rate": 9.622264332510432e-05, + "loss": 0.7184823751449585, + "step": 2768 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 1.165246605873108, + "learning_rate": 9.621348130725617e-05, + "loss": 0.693343460559845, + "step": 2770 + }, + { + "epoch": 1.169620253164557, + "grad_norm": 1.2853080034255981, + "learning_rate": 9.620430862884542e-05, + "loss": 0.6999852061271667, + "step": 2772 + }, + { + "epoch": 1.1704641350210971, + "grad_norm": 1.1782865524291992, + "learning_rate": 9.619512529198806e-05, + "loss": 0.6034331321716309, + "step": 2774 + }, + { + "epoch": 1.171308016877637, + "grad_norm": 1.4055447578430176, + "learning_rate": 9.61859312988025e-05, + "loss": 0.7588269710540771, + "step": 2776 + }, + { + "epoch": 1.1721518987341772, + "grad_norm": 1.1148805618286133, + "learning_rate": 9.617672665140957e-05, + "loss": 0.6913981437683105, + "step": 2778 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 1.1311042308807373, + "learning_rate": 9.616751135193266e-05, + "loss": 0.5976925492286682, + "step": 2780 + }, + { + "epoch": 1.1738396624472573, + "grad_norm": 1.2378602027893066, + "learning_rate": 9.615828540249754e-05, + "loss": 0.6897050142288208, + "step": 2782 + }, + { + "epoch": 1.1746835443037975, + "grad_norm": 1.3445732593536377, + "learning_rate": 9.614904880523248e-05, + "loss": 0.6772098541259766, + "step": 2784 + }, + { + "epoch": 1.1755274261603375, + "grad_norm": 1.3380862474441528, + "learning_rate": 9.613980156226815e-05, + "loss": 0.6354818344116211, + "step": 2786 + }, + { + "epoch": 1.1763713080168776, + "grad_norm": 1.0955157279968262, + "learning_rate": 9.613054367573773e-05, + "loss": 0.6541208028793335, + "step": 2788 + }, + { + "epoch": 1.1772151898734178, + "grad_norm": 1.0176626443862915, + "learning_rate": 9.612127514777686e-05, + "loss": 0.6472887992858887, + "step": 2790 + }, + { + "epoch": 1.1780590717299577, + "grad_norm": 1.2644864320755005, + "learning_rate": 9.611199598052357e-05, + "loss": 0.7511212229728699, + "step": 2792 + }, + { + "epoch": 1.1789029535864979, + "grad_norm": 1.248197317123413, + "learning_rate": 9.61027061761184e-05, + "loss": 0.696236789226532, + "step": 2794 + }, + { + "epoch": 1.179746835443038, + "grad_norm": 1.189935564994812, + "learning_rate": 9.609340573670436e-05, + "loss": 0.5962010622024536, + "step": 2796 + }, + { + "epoch": 1.180590717299578, + "grad_norm": 1.1760492324829102, + "learning_rate": 9.608409466442685e-05, + "loss": 0.5981685519218445, + "step": 2798 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 1.1820716857910156, + "learning_rate": 9.607477296143374e-05, + "loss": 0.6186091303825378, + "step": 2800 + }, + { + "epoch": 1.1814345991561181, + "eval_loss": 0.7298192977905273, + "eval_runtime": 849.544, + "eval_samples_per_second": 2.48, + "eval_steps_per_second": 2.48, + "step": 2800 + }, + { + "epoch": 1.1822784810126583, + "grad_norm": 1.0353888273239136, + "learning_rate": 9.606544062987541e-05, + "loss": 0.5859389901161194, + "step": 2802 + }, + { + "epoch": 1.1831223628691983, + "grad_norm": 1.3141933679580688, + "learning_rate": 9.605609767190464e-05, + "loss": 0.6573460698127747, + "step": 2804 + }, + { + "epoch": 1.1839662447257384, + "grad_norm": 1.1209372282028198, + "learning_rate": 9.604674408967664e-05, + "loss": 0.6991921067237854, + "step": 2806 + }, + { + "epoch": 1.1848101265822786, + "grad_norm": 1.2830493450164795, + "learning_rate": 9.603737988534913e-05, + "loss": 0.6438087821006775, + "step": 2808 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 1.1427195072174072, + "learning_rate": 9.602800506108225e-05, + "loss": 0.6452094316482544, + "step": 2810 + }, + { + "epoch": 1.1864978902953587, + "grad_norm": 1.316420078277588, + "learning_rate": 9.601861961903857e-05, + "loss": 0.6745601296424866, + "step": 2812 + }, + { + "epoch": 1.1873417721518988, + "grad_norm": 1.1643308401107788, + "learning_rate": 9.600922356138317e-05, + "loss": 0.6761514544487, + "step": 2814 + }, + { + "epoch": 1.1881856540084388, + "grad_norm": 1.036056399345398, + "learning_rate": 9.59998168902835e-05, + "loss": 0.6453908681869507, + "step": 2816 + }, + { + "epoch": 1.189029535864979, + "grad_norm": 1.2211129665374756, + "learning_rate": 9.599039960790954e-05, + "loss": 0.6576406359672546, + "step": 2818 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 1.084114670753479, + "learning_rate": 9.598097171643364e-05, + "loss": 0.6214181780815125, + "step": 2820 + }, + { + "epoch": 1.190717299578059, + "grad_norm": 1.1297314167022705, + "learning_rate": 9.597153321803064e-05, + "loss": 0.6381646990776062, + "step": 2822 + }, + { + "epoch": 1.1915611814345992, + "grad_norm": 1.2568120956420898, + "learning_rate": 9.596208411487784e-05, + "loss": 0.7129076719284058, + "step": 2824 + }, + { + "epoch": 1.1924050632911392, + "grad_norm": 1.07041335105896, + "learning_rate": 9.595262440915493e-05, + "loss": 0.7123546004295349, + "step": 2826 + }, + { + "epoch": 1.1932489451476793, + "grad_norm": 1.3950074911117554, + "learning_rate": 9.594315410304413e-05, + "loss": 0.7263038158416748, + "step": 2828 + }, + { + "epoch": 1.1940928270042195, + "grad_norm": 1.2470672130584717, + "learning_rate": 9.593367319873002e-05, + "loss": 0.6863036751747131, + "step": 2830 + }, + { + "epoch": 1.1949367088607594, + "grad_norm": 1.2065461874008179, + "learning_rate": 9.592418169839968e-05, + "loss": 0.745354175567627, + "step": 2832 + }, + { + "epoch": 1.1957805907172996, + "grad_norm": 1.1710152626037598, + "learning_rate": 9.591467960424261e-05, + "loss": 0.6401656866073608, + "step": 2834 + }, + { + "epoch": 1.1966244725738397, + "grad_norm": 1.3324087858200073, + "learning_rate": 9.590516691845077e-05, + "loss": 0.7402615547180176, + "step": 2836 + }, + { + "epoch": 1.1974683544303797, + "grad_norm": 1.0100195407867432, + "learning_rate": 9.589564364321855e-05, + "loss": 0.5723769068717957, + "step": 2838 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 1.2706246376037598, + "learning_rate": 9.588610978074277e-05, + "loss": 0.6618966460227966, + "step": 2840 + }, + { + "epoch": 1.1991561181434598, + "grad_norm": 1.1921758651733398, + "learning_rate": 9.587656533322273e-05, + "loss": 0.7090804576873779, + "step": 2842 + }, + { + "epoch": 1.2, + "grad_norm": 1.36713445186615, + "learning_rate": 9.586701030286014e-05, + "loss": 0.6930652856826782, + "step": 2844 + }, + { + "epoch": 1.2008438818565401, + "grad_norm": 1.3084295988082886, + "learning_rate": 9.585744469185917e-05, + "loss": 0.7386236190795898, + "step": 2846 + }, + { + "epoch": 1.20168776371308, + "grad_norm": 1.198922038078308, + "learning_rate": 9.584786850242642e-05, + "loss": 0.6179903149604797, + "step": 2848 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.2106369733810425, + "learning_rate": 9.583828173677092e-05, + "loss": 0.7027528882026672, + "step": 2850 + }, + { + "epoch": 1.2033755274261604, + "grad_norm": 1.2959522008895874, + "learning_rate": 9.582868439710418e-05, + "loss": 0.6612945199012756, + "step": 2852 + }, + { + "epoch": 1.2042194092827003, + "grad_norm": 1.1441705226898193, + "learning_rate": 9.58190764856401e-05, + "loss": 0.7085917592048645, + "step": 2854 + }, + { + "epoch": 1.2050632911392405, + "grad_norm": 1.1586185693740845, + "learning_rate": 9.580945800459504e-05, + "loss": 0.7480600476264954, + "step": 2856 + }, + { + "epoch": 1.2059071729957807, + "grad_norm": 1.2068266868591309, + "learning_rate": 9.579982895618783e-05, + "loss": 0.7185836434364319, + "step": 2858 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 1.2188525199890137, + "learning_rate": 9.579018934263966e-05, + "loss": 0.6737306118011475, + "step": 2860 + }, + { + "epoch": 1.2075949367088608, + "grad_norm": 1.1513181924819946, + "learning_rate": 9.578053916617423e-05, + "loss": 0.7239293456077576, + "step": 2862 + }, + { + "epoch": 1.208438818565401, + "grad_norm": 1.2063703536987305, + "learning_rate": 9.577087842901764e-05, + "loss": 0.6416276097297668, + "step": 2864 + }, + { + "epoch": 1.2092827004219409, + "grad_norm": 1.102460503578186, + "learning_rate": 9.576120713339844e-05, + "loss": 0.697213351726532, + "step": 2866 + }, + { + "epoch": 1.210126582278481, + "grad_norm": 1.2484638690948486, + "learning_rate": 9.575152528154763e-05, + "loss": 0.6664742231369019, + "step": 2868 + }, + { + "epoch": 1.2109704641350212, + "grad_norm": 1.4476624727249146, + "learning_rate": 9.57418328756986e-05, + "loss": 0.6914868354797363, + "step": 2870 + }, + { + "epoch": 1.2118143459915611, + "grad_norm": 1.0130122900009155, + "learning_rate": 9.573212991808722e-05, + "loss": 0.662024736404419, + "step": 2872 + }, + { + "epoch": 1.2126582278481013, + "grad_norm": 1.014470100402832, + "learning_rate": 9.572241641095177e-05, + "loss": 0.6330409646034241, + "step": 2874 + }, + { + "epoch": 1.2135021097046415, + "grad_norm": 1.1803333759307861, + "learning_rate": 9.571269235653298e-05, + "loss": 0.6607463955879211, + "step": 2876 + }, + { + "epoch": 1.2143459915611814, + "grad_norm": 1.261366844177246, + "learning_rate": 9.570295775707398e-05, + "loss": 0.6925629377365112, + "step": 2878 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 1.226670503616333, + "learning_rate": 9.569321261482037e-05, + "loss": 0.7070510983467102, + "step": 2880 + }, + { + "epoch": 1.2160337552742617, + "grad_norm": 1.164565920829773, + "learning_rate": 9.568345693202016e-05, + "loss": 0.7243561744689941, + "step": 2882 + }, + { + "epoch": 1.2168776371308017, + "grad_norm": 1.060331106185913, + "learning_rate": 9.567369071092382e-05, + "loss": 0.6316909790039062, + "step": 2884 + }, + { + "epoch": 1.2177215189873418, + "grad_norm": 1.1998693943023682, + "learning_rate": 9.566391395378419e-05, + "loss": 0.6139125227928162, + "step": 2886 + }, + { + "epoch": 1.2185654008438818, + "grad_norm": 1.1875834465026855, + "learning_rate": 9.565412666285661e-05, + "loss": 0.688897430896759, + "step": 2888 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 1.199174404144287, + "learning_rate": 9.564432884039882e-05, + "loss": 0.684590756893158, + "step": 2890 + }, + { + "epoch": 1.220253164556962, + "grad_norm": 1.2428219318389893, + "learning_rate": 9.563452048867099e-05, + "loss": 0.67433100938797, + "step": 2892 + }, + { + "epoch": 1.221097046413502, + "grad_norm": 1.0826431512832642, + "learning_rate": 9.562470160993568e-05, + "loss": 0.6959785223007202, + "step": 2894 + }, + { + "epoch": 1.2219409282700422, + "grad_norm": 1.3140246868133545, + "learning_rate": 9.561487220645797e-05, + "loss": 0.6443175673484802, + "step": 2896 + }, + { + "epoch": 1.2227848101265824, + "grad_norm": 1.2758334875106812, + "learning_rate": 9.560503228050529e-05, + "loss": 0.6715332865715027, + "step": 2898 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 1.3326421976089478, + "learning_rate": 9.559518183434753e-05, + "loss": 0.6896081566810608, + "step": 2900 + }, + { + "epoch": 1.2236286919831223, + "eval_loss": 0.7281573414802551, + "eval_runtime": 854.563, + "eval_samples_per_second": 2.466, + "eval_steps_per_second": 2.466, + "step": 2900 + }, + { + "epoch": 1.2244725738396625, + "grad_norm": 1.3225606679916382, + "learning_rate": 9.558532087025697e-05, + "loss": 0.6797633171081543, + "step": 2902 + }, + { + "epoch": 1.2253164556962026, + "grad_norm": 1.3058340549468994, + "learning_rate": 9.55754493905084e-05, + "loss": 0.6510948538780212, + "step": 2904 + }, + { + "epoch": 1.2261603375527426, + "grad_norm": 1.140268087387085, + "learning_rate": 9.556556739737892e-05, + "loss": 0.6481176614761353, + "step": 2906 + }, + { + "epoch": 1.2270042194092827, + "grad_norm": 1.465113639831543, + "learning_rate": 9.555567489314816e-05, + "loss": 0.7533771991729736, + "step": 2908 + }, + { + "epoch": 1.2278481012658227, + "grad_norm": 1.1468979120254517, + "learning_rate": 9.554577188009812e-05, + "loss": 0.6924305558204651, + "step": 2910 + }, + { + "epoch": 1.2286919831223628, + "grad_norm": 1.2193517684936523, + "learning_rate": 9.553585836051321e-05, + "loss": 0.7082820534706116, + "step": 2912 + }, + { + "epoch": 1.229535864978903, + "grad_norm": 1.2015037536621094, + "learning_rate": 9.552593433668034e-05, + "loss": 0.6735695004463196, + "step": 2914 + }, + { + "epoch": 1.230379746835443, + "grad_norm": 1.1915435791015625, + "learning_rate": 9.551599981088874e-05, + "loss": 0.7312048673629761, + "step": 2916 + }, + { + "epoch": 1.231223628691983, + "grad_norm": 1.2849410772323608, + "learning_rate": 9.550605478543013e-05, + "loss": 0.6590308547019958, + "step": 2918 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 1.192238688468933, + "learning_rate": 9.549609926259866e-05, + "loss": 0.6237715482711792, + "step": 2920 + }, + { + "epoch": 1.2329113924050632, + "grad_norm": 1.141845703125, + "learning_rate": 9.548613324469085e-05, + "loss": 0.6546295881271362, + "step": 2922 + }, + { + "epoch": 1.2337552742616034, + "grad_norm": 1.1662311553955078, + "learning_rate": 9.547615673400566e-05, + "loss": 0.5800934433937073, + "step": 2924 + }, + { + "epoch": 1.2345991561181435, + "grad_norm": 1.120578646659851, + "learning_rate": 9.546616973284453e-05, + "loss": 0.6487136483192444, + "step": 2926 + }, + { + "epoch": 1.2354430379746835, + "grad_norm": 1.0884860754013062, + "learning_rate": 9.54561722435112e-05, + "loss": 0.7515342235565186, + "step": 2928 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 1.4208670854568481, + "learning_rate": 9.544616426831196e-05, + "loss": 0.7162003517150879, + "step": 2930 + }, + { + "epoch": 1.2371308016877638, + "grad_norm": 1.083389401435852, + "learning_rate": 9.543614580955543e-05, + "loss": 0.708450198173523, + "step": 2932 + }, + { + "epoch": 1.2379746835443037, + "grad_norm": 1.141364336013794, + "learning_rate": 9.542611686955268e-05, + "loss": 0.6255859732627869, + "step": 2934 + }, + { + "epoch": 1.238818565400844, + "grad_norm": 1.122036099433899, + "learning_rate": 9.54160774506172e-05, + "loss": 0.6485402584075928, + "step": 2936 + }, + { + "epoch": 1.239662447257384, + "grad_norm": 1.3514165878295898, + "learning_rate": 9.540602755506487e-05, + "loss": 0.6735473871231079, + "step": 2938 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 1.1762629747390747, + "learning_rate": 9.539596718521403e-05, + "loss": 0.6154970526695251, + "step": 2940 + }, + { + "epoch": 1.2413502109704642, + "grad_norm": 1.1609408855438232, + "learning_rate": 9.53858963433854e-05, + "loss": 0.6410251259803772, + "step": 2942 + }, + { + "epoch": 1.2421940928270043, + "grad_norm": 1.1750361919403076, + "learning_rate": 9.537581503190214e-05, + "loss": 0.6841039657592773, + "step": 2944 + }, + { + "epoch": 1.2430379746835443, + "grad_norm": 1.3125680685043335, + "learning_rate": 9.536572325308982e-05, + "loss": 0.7293462753295898, + "step": 2946 + }, + { + "epoch": 1.2438818565400844, + "grad_norm": 1.1737277507781982, + "learning_rate": 9.53556210092764e-05, + "loss": 0.7713663578033447, + "step": 2948 + }, + { + "epoch": 1.2447257383966246, + "grad_norm": 1.1702152490615845, + "learning_rate": 9.53455083027923e-05, + "loss": 0.6612298488616943, + "step": 2950 + }, + { + "epoch": 1.2455696202531645, + "grad_norm": 1.2594486474990845, + "learning_rate": 9.533538513597028e-05, + "loss": 0.6725803017616272, + "step": 2952 + }, + { + "epoch": 1.2464135021097047, + "grad_norm": 1.180816411972046, + "learning_rate": 9.532525151114562e-05, + "loss": 0.6421069502830505, + "step": 2954 + }, + { + "epoch": 1.2472573839662446, + "grad_norm": 1.25814688205719, + "learning_rate": 9.531510743065593e-05, + "loss": 0.7042996287345886, + "step": 2956 + }, + { + "epoch": 1.2481012658227848, + "grad_norm": 1.2101783752441406, + "learning_rate": 9.530495289684122e-05, + "loss": 0.7359137535095215, + "step": 2958 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 1.1438405513763428, + "learning_rate": 9.5294787912044e-05, + "loss": 0.6186386346817017, + "step": 2960 + }, + { + "epoch": 1.249789029535865, + "grad_norm": 1.163364291191101, + "learning_rate": 9.52846124786091e-05, + "loss": 0.6243056058883667, + "step": 2962 + }, + { + "epoch": 1.250632911392405, + "grad_norm": 1.0695953369140625, + "learning_rate": 9.52744265988838e-05, + "loss": 0.6568763852119446, + "step": 2964 + }, + { + "epoch": 1.2514767932489452, + "grad_norm": 1.2228879928588867, + "learning_rate": 9.52642302752178e-05, + "loss": 0.6486776471138, + "step": 2966 + }, + { + "epoch": 1.2523206751054852, + "grad_norm": 1.2262967824935913, + "learning_rate": 9.52540235099632e-05, + "loss": 0.6293455958366394, + "step": 2968 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 1.0862956047058105, + "learning_rate": 9.524380630547449e-05, + "loss": 0.6549884080886841, + "step": 2970 + }, + { + "epoch": 1.2540084388185653, + "grad_norm": 1.1721880435943604, + "learning_rate": 9.52335786641086e-05, + "loss": 0.6126490831375122, + "step": 2972 + }, + { + "epoch": 1.2548523206751054, + "grad_norm": 1.2452391386032104, + "learning_rate": 9.522334058822483e-05, + "loss": 0.7078590393066406, + "step": 2974 + }, + { + "epoch": 1.2556962025316456, + "grad_norm": 1.2290222644805908, + "learning_rate": 9.521309208018492e-05, + "loss": 0.6166214942932129, + "step": 2976 + }, + { + "epoch": 1.2565400843881855, + "grad_norm": 1.1823618412017822, + "learning_rate": 9.520283314235299e-05, + "loss": 0.666228175163269, + "step": 2978 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 1.1702475547790527, + "learning_rate": 9.51925637770956e-05, + "loss": 0.7436795830726624, + "step": 2980 + }, + { + "epoch": 1.2582278481012659, + "grad_norm": 1.0879321098327637, + "learning_rate": 9.518228398678168e-05, + "loss": 0.7120893001556396, + "step": 2982 + }, + { + "epoch": 1.2590717299578058, + "grad_norm": 1.1608418226242065, + "learning_rate": 9.517199377378261e-05, + "loss": 0.6931713223457336, + "step": 2984 + }, + { + "epoch": 1.259915611814346, + "grad_norm": 1.1289087533950806, + "learning_rate": 9.51616931404721e-05, + "loss": 0.6803538799285889, + "step": 2986 + }, + { + "epoch": 1.2607594936708861, + "grad_norm": 1.1622236967086792, + "learning_rate": 9.515138208922633e-05, + "loss": 0.6499706506729126, + "step": 2988 + }, + { + "epoch": 1.261603375527426, + "grad_norm": 1.2492594718933105, + "learning_rate": 9.514106062242386e-05, + "loss": 0.6132655739784241, + "step": 2990 + }, + { + "epoch": 1.2624472573839662, + "grad_norm": 1.1538822650909424, + "learning_rate": 9.513072874244567e-05, + "loss": 0.6309265494346619, + "step": 2992 + }, + { + "epoch": 1.2632911392405064, + "grad_norm": 1.0828478336334229, + "learning_rate": 9.512038645167509e-05, + "loss": 0.6297751665115356, + "step": 2994 + }, + { + "epoch": 1.2641350210970463, + "grad_norm": 1.2440937757492065, + "learning_rate": 9.511003375249792e-05, + "loss": 0.6335258483886719, + "step": 2996 + }, + { + "epoch": 1.2649789029535865, + "grad_norm": 1.1259970664978027, + "learning_rate": 9.50996706473023e-05, + "loss": 0.6513770818710327, + "step": 2998 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.1530309915542603, + "learning_rate": 9.508929713847884e-05, + "loss": 0.6490892767906189, + "step": 3000 + }, + { + "epoch": 1.2658227848101267, + "eval_loss": 0.72515869140625, + "eval_runtime": 868.0515, + "eval_samples_per_second": 2.427, + "eval_steps_per_second": 2.427, + "step": 3000 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.2257169485092163, + "learning_rate": 9.507891322842048e-05, + "loss": 0.6936060786247253, + "step": 3002 + }, + { + "epoch": 1.2675105485232068, + "grad_norm": 1.0380109548568726, + "learning_rate": 9.506851891952259e-05, + "loss": 0.5941951870918274, + "step": 3004 + }, + { + "epoch": 1.268354430379747, + "grad_norm": 1.2830222845077515, + "learning_rate": 9.505811421418296e-05, + "loss": 0.648429811000824, + "step": 3006 + }, + { + "epoch": 1.2691983122362869, + "grad_norm": 1.2212986946105957, + "learning_rate": 9.504769911480171e-05, + "loss": 0.6868565678596497, + "step": 3008 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 1.104656457901001, + "learning_rate": 9.503727362378145e-05, + "loss": 0.6777986288070679, + "step": 3010 + }, + { + "epoch": 1.2708860759493672, + "grad_norm": 1.1449005603790283, + "learning_rate": 9.502683774352713e-05, + "loss": 0.6581128239631653, + "step": 3012 + }, + { + "epoch": 1.2717299578059071, + "grad_norm": 1.2753362655639648, + "learning_rate": 9.501639147644608e-05, + "loss": 0.689930260181427, + "step": 3014 + }, + { + "epoch": 1.2725738396624473, + "grad_norm": 1.3367106914520264, + "learning_rate": 9.500593482494809e-05, + "loss": 0.7549214363098145, + "step": 3016 + }, + { + "epoch": 1.2734177215189875, + "grad_norm": 1.2309048175811768, + "learning_rate": 9.499546779144528e-05, + "loss": 0.6713513135910034, + "step": 3018 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 1.3833240270614624, + "learning_rate": 9.49849903783522e-05, + "loss": 0.7045458555221558, + "step": 3020 + }, + { + "epoch": 1.2751054852320676, + "grad_norm": 1.1402570009231567, + "learning_rate": 9.49745025880858e-05, + "loss": 0.708249568939209, + "step": 3022 + }, + { + "epoch": 1.2759493670886077, + "grad_norm": 1.0476267337799072, + "learning_rate": 9.496400442306541e-05, + "loss": 0.616210401058197, + "step": 3024 + }, + { + "epoch": 1.2767932489451477, + "grad_norm": 1.1045979261398315, + "learning_rate": 9.495349588571274e-05, + "loss": 0.6691827178001404, + "step": 3026 + }, + { + "epoch": 1.2776371308016878, + "grad_norm": 1.1760368347167969, + "learning_rate": 9.494297697845194e-05, + "loss": 0.6198306083679199, + "step": 3028 + }, + { + "epoch": 1.2784810126582278, + "grad_norm": 1.0015549659729004, + "learning_rate": 9.493244770370946e-05, + "loss": 0.5756480097770691, + "step": 3030 + }, + { + "epoch": 1.279324894514768, + "grad_norm": 1.2190428972244263, + "learning_rate": 9.492190806391427e-05, + "loss": 0.6794419884681702, + "step": 3032 + }, + { + "epoch": 1.2801687763713079, + "grad_norm": 1.0210410356521606, + "learning_rate": 9.491135806149762e-05, + "loss": 0.5847988724708557, + "step": 3034 + }, + { + "epoch": 1.281012658227848, + "grad_norm": 1.0678503513336182, + "learning_rate": 9.490079769889319e-05, + "loss": 0.6760231256484985, + "step": 3036 + }, + { + "epoch": 1.2818565400843882, + "grad_norm": 1.1811012029647827, + "learning_rate": 9.489022697853709e-05, + "loss": 0.7188448309898376, + "step": 3038 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 1.1134302616119385, + "learning_rate": 9.487964590286776e-05, + "loss": 0.674904465675354, + "step": 3040 + }, + { + "epoch": 1.2835443037974683, + "grad_norm": 1.1868232488632202, + "learning_rate": 9.486905447432603e-05, + "loss": 0.6016344428062439, + "step": 3042 + }, + { + "epoch": 1.2843881856540085, + "grad_norm": 1.1586613655090332, + "learning_rate": 9.485845269535517e-05, + "loss": 0.6965603828430176, + "step": 3044 + }, + { + "epoch": 1.2852320675105484, + "grad_norm": 1.149837613105774, + "learning_rate": 9.48478405684008e-05, + "loss": 0.656144380569458, + "step": 3046 + }, + { + "epoch": 1.2860759493670886, + "grad_norm": 1.228752613067627, + "learning_rate": 9.48372180959109e-05, + "loss": 0.6388653516769409, + "step": 3048 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 1.2403100728988647, + "learning_rate": 9.482658528033595e-05, + "loss": 0.6255465745925903, + "step": 3050 + }, + { + "epoch": 1.2877637130801687, + "grad_norm": 1.2483839988708496, + "learning_rate": 9.481594212412865e-05, + "loss": 0.6828253269195557, + "step": 3052 + }, + { + "epoch": 1.2886075949367088, + "grad_norm": 1.4161021709442139, + "learning_rate": 9.480528862974422e-05, + "loss": 0.7072080373764038, + "step": 3054 + }, + { + "epoch": 1.289451476793249, + "grad_norm": 1.1500437259674072, + "learning_rate": 9.479462479964021e-05, + "loss": 0.6082415580749512, + "step": 3056 + }, + { + "epoch": 1.290295358649789, + "grad_norm": 1.196595549583435, + "learning_rate": 9.478395063627654e-05, + "loss": 0.6653015613555908, + "step": 3058 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 1.2832285165786743, + "learning_rate": 9.477326614211557e-05, + "loss": 0.7095832824707031, + "step": 3060 + }, + { + "epoch": 1.2919831223628693, + "grad_norm": 1.2234288454055786, + "learning_rate": 9.476257131962198e-05, + "loss": 0.7183426022529602, + "step": 3062 + }, + { + "epoch": 1.2928270042194092, + "grad_norm": 1.2350459098815918, + "learning_rate": 9.475186617126286e-05, + "loss": 0.713284432888031, + "step": 3064 + }, + { + "epoch": 1.2936708860759494, + "grad_norm": 1.2079555988311768, + "learning_rate": 9.47411506995077e-05, + "loss": 0.6580002307891846, + "step": 3066 + }, + { + "epoch": 1.2945147679324895, + "grad_norm": 1.129796028137207, + "learning_rate": 9.473042490682835e-05, + "loss": 0.5967763662338257, + "step": 3068 + }, + { + "epoch": 1.2953586497890295, + "grad_norm": 1.1706618070602417, + "learning_rate": 9.471968879569901e-05, + "loss": 0.6724388003349304, + "step": 3070 + }, + { + "epoch": 1.2962025316455696, + "grad_norm": 1.0336005687713623, + "learning_rate": 9.470894236859635e-05, + "loss": 0.6527577638626099, + "step": 3072 + }, + { + "epoch": 1.2970464135021098, + "grad_norm": 1.1124558448791504, + "learning_rate": 9.469818562799932e-05, + "loss": 0.677132785320282, + "step": 3074 + }, + { + "epoch": 1.2978902953586497, + "grad_norm": 1.158069372177124, + "learning_rate": 9.468741857638933e-05, + "loss": 0.649718165397644, + "step": 3076 + }, + { + "epoch": 1.29873417721519, + "grad_norm": 1.092926263809204, + "learning_rate": 9.46766412162501e-05, + "loss": 0.6872133612632751, + "step": 3078 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 1.1324822902679443, + "learning_rate": 9.466585355006777e-05, + "loss": 0.6495246291160583, + "step": 3080 + }, + { + "epoch": 1.30042194092827, + "grad_norm": 1.5882837772369385, + "learning_rate": 9.465505558033086e-05, + "loss": 0.6730570197105408, + "step": 3082 + }, + { + "epoch": 1.3012658227848102, + "grad_norm": 0.9866069555282593, + "learning_rate": 9.464424730953023e-05, + "loss": 0.5677527785301208, + "step": 3084 + }, + { + "epoch": 1.3021097046413503, + "grad_norm": 1.1560224294662476, + "learning_rate": 9.463342874015917e-05, + "loss": 0.6247856020927429, + "step": 3086 + }, + { + "epoch": 1.3029535864978903, + "grad_norm": 1.135939359664917, + "learning_rate": 9.462259987471329e-05, + "loss": 0.6889358758926392, + "step": 3088 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 1.3935760259628296, + "learning_rate": 9.461176071569063e-05, + "loss": 0.7097522020339966, + "step": 3090 + }, + { + "epoch": 1.3046413502109704, + "grad_norm": 1.153518795967102, + "learning_rate": 9.460091126559155e-05, + "loss": 0.7044580578804016, + "step": 3092 + }, + { + "epoch": 1.3054852320675105, + "grad_norm": 1.2112717628479004, + "learning_rate": 9.45900515269188e-05, + "loss": 0.6119300723075867, + "step": 3094 + }, + { + "epoch": 1.3063291139240507, + "grad_norm": 1.295591115951538, + "learning_rate": 9.457918150217754e-05, + "loss": 0.7150222063064575, + "step": 3096 + }, + { + "epoch": 1.3071729957805907, + "grad_norm": 1.1175775527954102, + "learning_rate": 9.456830119387527e-05, + "loss": 0.6043334007263184, + "step": 3098 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 1.4022588729858398, + "learning_rate": 9.455741060452186e-05, + "loss": 0.6354425549507141, + "step": 3100 + }, + { + "epoch": 1.3080168776371308, + "eval_loss": 0.7225774526596069, + "eval_runtime": 862.4006, + "eval_samples_per_second": 2.443, + "eval_steps_per_second": 2.443, + "step": 3100 + }, + { + "epoch": 1.3088607594936708, + "grad_norm": 1.1657692193984985, + "learning_rate": 9.454650973662957e-05, + "loss": 0.7281571626663208, + "step": 3102 + }, + { + "epoch": 1.309704641350211, + "grad_norm": 1.6169127225875854, + "learning_rate": 9.453559859271301e-05, + "loss": 0.8038214445114136, + "step": 3104 + }, + { + "epoch": 1.310548523206751, + "grad_norm": 1.1256520748138428, + "learning_rate": 9.452467717528918e-05, + "loss": 0.6488606333732605, + "step": 3106 + }, + { + "epoch": 1.311392405063291, + "grad_norm": 1.1224530935287476, + "learning_rate": 9.451374548687745e-05, + "loss": 0.6897066235542297, + "step": 3108 + }, + { + "epoch": 1.3122362869198312, + "grad_norm": 1.1123055219650269, + "learning_rate": 9.450280352999952e-05, + "loss": 0.6332913041114807, + "step": 3110 + }, + { + "epoch": 1.3130801687763713, + "grad_norm": 1.1688940525054932, + "learning_rate": 9.449185130717952e-05, + "loss": 0.7426630854606628, + "step": 3112 + }, + { + "epoch": 1.3139240506329113, + "grad_norm": 1.1898044347763062, + "learning_rate": 9.44808888209439e-05, + "loss": 0.7156099677085876, + "step": 3114 + }, + { + "epoch": 1.3147679324894515, + "grad_norm": 1.3030686378479004, + "learning_rate": 9.44699160738215e-05, + "loss": 0.7150979042053223, + "step": 3116 + }, + { + "epoch": 1.3156118143459916, + "grad_norm": 1.1539074182510376, + "learning_rate": 9.445893306834352e-05, + "loss": 0.6687285900115967, + "step": 3118 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 1.311808466911316, + "learning_rate": 9.444793980704355e-05, + "loss": 0.7340983152389526, + "step": 3120 + }, + { + "epoch": 1.3172995780590717, + "grad_norm": 1.3325430154800415, + "learning_rate": 9.44369362924575e-05, + "loss": 0.6620677709579468, + "step": 3122 + }, + { + "epoch": 1.3181434599156119, + "grad_norm": 1.201518177986145, + "learning_rate": 9.442592252712365e-05, + "loss": 0.6169955134391785, + "step": 3124 + }, + { + "epoch": 1.3189873417721518, + "grad_norm": 1.2124013900756836, + "learning_rate": 9.441489851358272e-05, + "loss": 0.6696792840957642, + "step": 3126 + }, + { + "epoch": 1.319831223628692, + "grad_norm": 1.2186850309371948, + "learning_rate": 9.440386425437768e-05, + "loss": 0.7303428649902344, + "step": 3128 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 1.3780523538589478, + "learning_rate": 9.439281975205396e-05, + "loss": 0.7093026638031006, + "step": 3130 + }, + { + "epoch": 1.321518987341772, + "grad_norm": 1.233353614807129, + "learning_rate": 9.438176500915932e-05, + "loss": 0.6821767687797546, + "step": 3132 + }, + { + "epoch": 1.3223628691983123, + "grad_norm": 1.2425329685211182, + "learning_rate": 9.437070002824385e-05, + "loss": 0.700680136680603, + "step": 3134 + }, + { + "epoch": 1.3232067510548524, + "grad_norm": 1.1600432395935059, + "learning_rate": 9.435962481186003e-05, + "loss": 0.6173145771026611, + "step": 3136 + }, + { + "epoch": 1.3240506329113924, + "grad_norm": 1.279336929321289, + "learning_rate": 9.434853936256272e-05, + "loss": 0.6597106456756592, + "step": 3138 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 1.1787258386611938, + "learning_rate": 9.433744368290909e-05, + "loss": 0.6655287742614746, + "step": 3140 + }, + { + "epoch": 1.3257383966244727, + "grad_norm": 1.3658509254455566, + "learning_rate": 9.432633777545874e-05, + "loss": 0.6312944889068604, + "step": 3142 + }, + { + "epoch": 1.3265822784810126, + "grad_norm": 1.1220000982284546, + "learning_rate": 9.431522164277356e-05, + "loss": 0.6696156859397888, + "step": 3144 + }, + { + "epoch": 1.3274261603375528, + "grad_norm": 1.224761724472046, + "learning_rate": 9.430409528741783e-05, + "loss": 0.6586571335792542, + "step": 3146 + }, + { + "epoch": 1.328270042194093, + "grad_norm": 1.227510929107666, + "learning_rate": 9.429295871195821e-05, + "loss": 0.64905846118927, + "step": 3148 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.1359103918075562, + "learning_rate": 9.428181191896366e-05, + "loss": 0.6407933831214905, + "step": 3150 + }, + { + "epoch": 1.329957805907173, + "grad_norm": 1.2729473114013672, + "learning_rate": 9.427065491100556e-05, + "loss": 0.7004884481430054, + "step": 3152 + }, + { + "epoch": 1.3308016877637132, + "grad_norm": 1.1182841062545776, + "learning_rate": 9.42594876906576e-05, + "loss": 0.6835907101631165, + "step": 3154 + }, + { + "epoch": 1.3316455696202532, + "grad_norm": 1.2309781312942505, + "learning_rate": 9.424831026049585e-05, + "loss": 0.7476315498352051, + "step": 3156 + }, + { + "epoch": 1.3324894514767933, + "grad_norm": 1.0857728719711304, + "learning_rate": 9.423712262309873e-05, + "loss": 0.6811426281929016, + "step": 3158 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.299680233001709, + "learning_rate": 9.4225924781047e-05, + "loss": 0.6403942108154297, + "step": 3160 + }, + { + "epoch": 1.3341772151898734, + "grad_norm": 1.226472020149231, + "learning_rate": 9.421471673692382e-05, + "loss": 0.6758930683135986, + "step": 3162 + }, + { + "epoch": 1.3350210970464136, + "grad_norm": 1.1403205394744873, + "learning_rate": 9.420349849331463e-05, + "loss": 0.7119444608688354, + "step": 3164 + }, + { + "epoch": 1.3358649789029535, + "grad_norm": 1.2888442277908325, + "learning_rate": 9.419227005280729e-05, + "loss": 0.7411463260650635, + "step": 3166 + }, + { + "epoch": 1.3367088607594937, + "grad_norm": 1.1929190158843994, + "learning_rate": 9.418103141799197e-05, + "loss": 0.5992606282234192, + "step": 3168 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 1.2574355602264404, + "learning_rate": 9.416978259146122e-05, + "loss": 0.6728890538215637, + "step": 3170 + }, + { + "epoch": 1.3383966244725738, + "grad_norm": 0.9653727412223816, + "learning_rate": 9.415852357580992e-05, + "loss": 0.6294883489608765, + "step": 3172 + }, + { + "epoch": 1.339240506329114, + "grad_norm": 1.2107670307159424, + "learning_rate": 9.414725437363532e-05, + "loss": 0.6816665530204773, + "step": 3174 + }, + { + "epoch": 1.340084388185654, + "grad_norm": 1.024849534034729, + "learning_rate": 9.4135974987537e-05, + "loss": 0.6186381578445435, + "step": 3176 + }, + { + "epoch": 1.340928270042194, + "grad_norm": 1.1556614637374878, + "learning_rate": 9.41246854201169e-05, + "loss": 0.6071005463600159, + "step": 3178 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 1.2382808923721313, + "learning_rate": 9.41133856739793e-05, + "loss": 0.7871434092521667, + "step": 3180 + }, + { + "epoch": 1.3426160337552742, + "grad_norm": 1.0499578714370728, + "learning_rate": 9.410207575173082e-05, + "loss": 0.6578201651573181, + "step": 3182 + }, + { + "epoch": 1.3434599156118143, + "grad_norm": 1.2048250436782837, + "learning_rate": 9.409075565598049e-05, + "loss": 0.6271620392799377, + "step": 3184 + }, + { + "epoch": 1.3443037974683545, + "grad_norm": 1.0287591218948364, + "learning_rate": 9.407942538933958e-05, + "loss": 0.5773864388465881, + "step": 3186 + }, + { + "epoch": 1.3451476793248944, + "grad_norm": 1.1125097274780273, + "learning_rate": 9.406808495442181e-05, + "loss": 0.6745175719261169, + "step": 3188 + }, + { + "epoch": 1.3459915611814346, + "grad_norm": 1.036125898361206, + "learning_rate": 9.405673435384319e-05, + "loss": 0.6001214385032654, + "step": 3190 + }, + { + "epoch": 1.3468354430379748, + "grad_norm": 1.2771985530853271, + "learning_rate": 9.404537359022207e-05, + "loss": 0.6703945994377136, + "step": 3192 + }, + { + "epoch": 1.3476793248945147, + "grad_norm": 1.0891097784042358, + "learning_rate": 9.403400266617918e-05, + "loss": 0.6159096360206604, + "step": 3194 + }, + { + "epoch": 1.3485232067510549, + "grad_norm": 1.1926233768463135, + "learning_rate": 9.402262158433755e-05, + "loss": 0.6439315676689148, + "step": 3196 + }, + { + "epoch": 1.349367088607595, + "grad_norm": 1.272557020187378, + "learning_rate": 9.40112303473226e-05, + "loss": 0.7125352025032043, + "step": 3198 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 1.052037239074707, + "learning_rate": 9.399982895776207e-05, + "loss": 0.594719648361206, + "step": 3200 + }, + { + "epoch": 1.350210970464135, + "eval_loss": 0.7200453281402588, + "eval_runtime": 846.2953, + "eval_samples_per_second": 2.49, + "eval_steps_per_second": 2.49, + "step": 3200 + }, + { + "epoch": 1.3510548523206751, + "grad_norm": 1.204728126525879, + "learning_rate": 9.398841741828601e-05, + "loss": 0.6390520334243774, + "step": 3202 + }, + { + "epoch": 1.3518987341772153, + "grad_norm": 1.0873899459838867, + "learning_rate": 9.397699573152689e-05, + "loss": 0.6010531187057495, + "step": 3204 + }, + { + "epoch": 1.3527426160337552, + "grad_norm": 1.3124359846115112, + "learning_rate": 9.396556390011944e-05, + "loss": 0.724280834197998, + "step": 3206 + }, + { + "epoch": 1.3535864978902954, + "grad_norm": 1.2179948091506958, + "learning_rate": 9.395412192670075e-05, + "loss": 0.6430405378341675, + "step": 3208 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 1.2617219686508179, + "learning_rate": 9.394266981391031e-05, + "loss": 0.7188641428947449, + "step": 3210 + }, + { + "epoch": 1.3552742616033755, + "grad_norm": 1.2151501178741455, + "learning_rate": 9.393120756438988e-05, + "loss": 0.6724364757537842, + "step": 3212 + }, + { + "epoch": 1.3561181434599157, + "grad_norm": 1.221528172492981, + "learning_rate": 9.391973518078357e-05, + "loss": 0.6340664625167847, + "step": 3214 + }, + { + "epoch": 1.3569620253164558, + "grad_norm": 1.3180092573165894, + "learning_rate": 9.390825266573786e-05, + "loss": 0.6914255023002625, + "step": 3216 + }, + { + "epoch": 1.3578059071729958, + "grad_norm": 1.103994369506836, + "learning_rate": 9.38967600219015e-05, + "loss": 0.6137136220932007, + "step": 3218 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 1.33389413356781, + "learning_rate": 9.38852572519257e-05, + "loss": 0.7173700332641602, + "step": 3220 + }, + { + "epoch": 1.3594936708860759, + "grad_norm": 1.1074159145355225, + "learning_rate": 9.387374435846386e-05, + "loss": 0.5942243933677673, + "step": 3222 + }, + { + "epoch": 1.360337552742616, + "grad_norm": 1.1157063245773315, + "learning_rate": 9.386222134417182e-05, + "loss": 0.6362866163253784, + "step": 3224 + }, + { + "epoch": 1.3611814345991562, + "grad_norm": 1.1717792749404907, + "learning_rate": 9.38506882117077e-05, + "loss": 0.6784523129463196, + "step": 3226 + }, + { + "epoch": 1.3620253164556961, + "grad_norm": 1.0946043729782104, + "learning_rate": 9.383914496373197e-05, + "loss": 0.6647377014160156, + "step": 3228 + }, + { + "epoch": 1.3628691983122363, + "grad_norm": 1.1519699096679688, + "learning_rate": 9.382759160290746e-05, + "loss": 0.6302075982093811, + "step": 3230 + }, + { + "epoch": 1.3637130801687762, + "grad_norm": 0.9928684830665588, + "learning_rate": 9.381602813189929e-05, + "loss": 0.5979090332984924, + "step": 3232 + }, + { + "epoch": 1.3645569620253164, + "grad_norm": 1.2488124370574951, + "learning_rate": 9.380445455337492e-05, + "loss": 0.6949353218078613, + "step": 3234 + }, + { + "epoch": 1.3654008438818566, + "grad_norm": 1.3884797096252441, + "learning_rate": 9.379287087000416e-05, + "loss": 0.7225558161735535, + "step": 3236 + }, + { + "epoch": 1.3662447257383965, + "grad_norm": 1.2981176376342773, + "learning_rate": 9.378127708445917e-05, + "loss": 0.6993390917778015, + "step": 3238 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.9884640574455261, + "learning_rate": 9.376967319941438e-05, + "loss": 0.6983805894851685, + "step": 3240 + }, + { + "epoch": 1.3679324894514768, + "grad_norm": 1.2051894664764404, + "learning_rate": 9.375805921754659e-05, + "loss": 0.7062534689903259, + "step": 3242 + }, + { + "epoch": 1.3687763713080168, + "grad_norm": 1.1943434476852417, + "learning_rate": 9.374643514153494e-05, + "loss": 0.6405107378959656, + "step": 3244 + }, + { + "epoch": 1.369620253164557, + "grad_norm": 1.249214768409729, + "learning_rate": 9.373480097406086e-05, + "loss": 0.6844781637191772, + "step": 3246 + }, + { + "epoch": 1.370464135021097, + "grad_norm": 1.1847131252288818, + "learning_rate": 9.372315671780813e-05, + "loss": 0.6048306226730347, + "step": 3248 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 1.125545859336853, + "learning_rate": 9.37115023754629e-05, + "loss": 0.6772685050964355, + "step": 3250 + }, + { + "epoch": 1.3721518987341772, + "grad_norm": 1.466615915298462, + "learning_rate": 9.369983794971354e-05, + "loss": 0.7536272406578064, + "step": 3252 + }, + { + "epoch": 1.3729957805907174, + "grad_norm": 1.066699504852295, + "learning_rate": 9.368816344325084e-05, + "loss": 0.6640655398368835, + "step": 3254 + }, + { + "epoch": 1.3738396624472573, + "grad_norm": 1.4793988466262817, + "learning_rate": 9.367647885876787e-05, + "loss": 0.7029458284378052, + "step": 3256 + }, + { + "epoch": 1.3746835443037975, + "grad_norm": 1.258540153503418, + "learning_rate": 9.366478419896006e-05, + "loss": 0.7231863737106323, + "step": 3258 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 1.176106333732605, + "learning_rate": 9.365307946652512e-05, + "loss": 0.6679144501686096, + "step": 3260 + }, + { + "epoch": 1.3763713080168776, + "grad_norm": 1.3301753997802734, + "learning_rate": 9.364136466416316e-05, + "loss": 0.6282188296318054, + "step": 3262 + }, + { + "epoch": 1.3772151898734177, + "grad_norm": 1.3616732358932495, + "learning_rate": 9.362963979457648e-05, + "loss": 0.6870840191841125, + "step": 3264 + }, + { + "epoch": 1.378059071729958, + "grad_norm": 1.1982418298721313, + "learning_rate": 9.361790486046985e-05, + "loss": 0.6823731660842896, + "step": 3266 + }, + { + "epoch": 1.3789029535864978, + "grad_norm": 1.1869033575057983, + "learning_rate": 9.360615986455024e-05, + "loss": 0.6582897305488586, + "step": 3268 + }, + { + "epoch": 1.379746835443038, + "grad_norm": 1.1192975044250488, + "learning_rate": 9.359440480952703e-05, + "loss": 0.716654360294342, + "step": 3270 + }, + { + "epoch": 1.3805907172995782, + "grad_norm": 1.2210016250610352, + "learning_rate": 9.358263969811189e-05, + "loss": 0.6880061626434326, + "step": 3272 + }, + { + "epoch": 1.381434599156118, + "grad_norm": 1.0358284711837769, + "learning_rate": 9.357086453301878e-05, + "loss": 0.666864812374115, + "step": 3274 + }, + { + "epoch": 1.3822784810126583, + "grad_norm": 1.2790803909301758, + "learning_rate": 9.355907931696401e-05, + "loss": 0.6872087121009827, + "step": 3276 + }, + { + "epoch": 1.3831223628691984, + "grad_norm": 1.182991623878479, + "learning_rate": 9.354728405266623e-05, + "loss": 0.5929665565490723, + "step": 3278 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 1.1071184873580933, + "learning_rate": 9.353547874284634e-05, + "loss": 0.5928181409835815, + "step": 3280 + }, + { + "epoch": 1.3848101265822785, + "grad_norm": 1.3139623403549194, + "learning_rate": 9.352366339022763e-05, + "loss": 0.6783652901649475, + "step": 3282 + }, + { + "epoch": 1.3856540084388187, + "grad_norm": 1.2534632682800293, + "learning_rate": 9.351183799753567e-05, + "loss": 0.7652941346168518, + "step": 3284 + }, + { + "epoch": 1.3864978902953586, + "grad_norm": 1.4487930536270142, + "learning_rate": 9.350000256749833e-05, + "loss": 0.7430433630943298, + "step": 3286 + }, + { + "epoch": 1.3873417721518988, + "grad_norm": 1.0786021947860718, + "learning_rate": 9.348815710284584e-05, + "loss": 0.5854598879814148, + "step": 3288 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 1.0544480085372925, + "learning_rate": 9.347630160631071e-05, + "loss": 0.6365222334861755, + "step": 3290 + }, + { + "epoch": 1.389029535864979, + "grad_norm": 0.9989988207817078, + "learning_rate": 9.346443608062778e-05, + "loss": 0.6485803127288818, + "step": 3292 + }, + { + "epoch": 1.389873417721519, + "grad_norm": 1.100951910018921, + "learning_rate": 9.345256052853419e-05, + "loss": 0.6417753100395203, + "step": 3294 + }, + { + "epoch": 1.390717299578059, + "grad_norm": 1.1398471593856812, + "learning_rate": 9.344067495276942e-05, + "loss": 0.6333693861961365, + "step": 3296 + }, + { + "epoch": 1.3915611814345992, + "grad_norm": 1.1745941638946533, + "learning_rate": 9.342877935607521e-05, + "loss": 0.677288293838501, + "step": 3298 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.2651115655899048, + "learning_rate": 9.34168737411957e-05, + "loss": 0.7408396005630493, + "step": 3300 + }, + { + "epoch": 1.3924050632911391, + "eval_loss": 0.7173135876655579, + "eval_runtime": 853.5344, + "eval_samples_per_second": 2.469, + "eval_steps_per_second": 2.469, + "step": 3300 + }, + { + "epoch": 1.3932489451476793, + "grad_norm": 1.0747730731964111, + "learning_rate": 9.340495811087723e-05, + "loss": 0.6810371279716492, + "step": 3302 + }, + { + "epoch": 1.3940928270042194, + "grad_norm": 1.2857651710510254, + "learning_rate": 9.339303246786854e-05, + "loss": 0.6693953275680542, + "step": 3304 + }, + { + "epoch": 1.3949367088607594, + "grad_norm": 1.4544212818145752, + "learning_rate": 9.338109681492063e-05, + "loss": 0.7019274234771729, + "step": 3306 + }, + { + "epoch": 1.3957805907172995, + "grad_norm": 1.687755823135376, + "learning_rate": 9.336915115478685e-05, + "loss": 0.6074224710464478, + "step": 3308 + }, + { + "epoch": 1.3966244725738397, + "grad_norm": 1.1645431518554688, + "learning_rate": 9.33571954902228e-05, + "loss": 0.6981383562088013, + "step": 3310 + }, + { + "epoch": 1.3974683544303796, + "grad_norm": 1.6173527240753174, + "learning_rate": 9.334522982398646e-05, + "loss": 0.7282926440238953, + "step": 3312 + }, + { + "epoch": 1.3983122362869198, + "grad_norm": 1.3132909536361694, + "learning_rate": 9.333325415883804e-05, + "loss": 0.6574883460998535, + "step": 3314 + }, + { + "epoch": 1.39915611814346, + "grad_norm": 1.1629762649536133, + "learning_rate": 9.332126849754014e-05, + "loss": 0.6559937596321106, + "step": 3316 + }, + { + "epoch": 1.4, + "grad_norm": 1.1666897535324097, + "learning_rate": 9.33092728428576e-05, + "loss": 0.683718740940094, + "step": 3318 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 1.2269554138183594, + "learning_rate": 9.329726719755756e-05, + "loss": 0.6909779906272888, + "step": 3320 + }, + { + "epoch": 1.4016877637130802, + "grad_norm": 1.1010066270828247, + "learning_rate": 9.328525156440952e-05, + "loss": 0.6051948666572571, + "step": 3322 + }, + { + "epoch": 1.4025316455696202, + "grad_norm": 1.127143144607544, + "learning_rate": 9.327322594618528e-05, + "loss": 0.6266679763793945, + "step": 3324 + }, + { + "epoch": 1.4033755274261603, + "grad_norm": 1.2160708904266357, + "learning_rate": 9.326119034565887e-05, + "loss": 0.6587526202201843, + "step": 3326 + }, + { + "epoch": 1.4042194092827005, + "grad_norm": 1.0853947401046753, + "learning_rate": 9.32491447656067e-05, + "loss": 0.5916946530342102, + "step": 3328 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 1.2205027341842651, + "learning_rate": 9.323708920880744e-05, + "loss": 0.6032452583312988, + "step": 3330 + }, + { + "epoch": 1.4059071729957806, + "grad_norm": 1.1964668035507202, + "learning_rate": 9.32250236780421e-05, + "loss": 0.6649114489555359, + "step": 3332 + }, + { + "epoch": 1.4067510548523208, + "grad_norm": 1.2507994174957275, + "learning_rate": 9.321294817609394e-05, + "loss": 0.7142994403839111, + "step": 3334 + }, + { + "epoch": 1.4075949367088607, + "grad_norm": 1.1310259103775024, + "learning_rate": 9.320086270574854e-05, + "loss": 0.709568977355957, + "step": 3336 + }, + { + "epoch": 1.4084388185654009, + "grad_norm": 1.2454090118408203, + "learning_rate": 9.318876726979385e-05, + "loss": 0.7800853848457336, + "step": 3338 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 1.1168389320373535, + "learning_rate": 9.317666187101996e-05, + "loss": 0.6187908053398132, + "step": 3340 + }, + { + "epoch": 1.410126582278481, + "grad_norm": 1.6696287393569946, + "learning_rate": 9.316454651221942e-05, + "loss": 0.6222613453865051, + "step": 3342 + }, + { + "epoch": 1.4109704641350211, + "grad_norm": 0.9500295519828796, + "learning_rate": 9.315242119618698e-05, + "loss": 0.6116594672203064, + "step": 3344 + }, + { + "epoch": 1.4118143459915613, + "grad_norm": 1.186358094215393, + "learning_rate": 9.314028592571973e-05, + "loss": 0.633224368095398, + "step": 3346 + }, + { + "epoch": 1.4126582278481012, + "grad_norm": 1.1855978965759277, + "learning_rate": 9.312814070361705e-05, + "loss": 0.6675921082496643, + "step": 3348 + }, + { + "epoch": 1.4135021097046414, + "grad_norm": 1.2465872764587402, + "learning_rate": 9.311598553268059e-05, + "loss": 0.7268879413604736, + "step": 3350 + }, + { + "epoch": 1.4143459915611816, + "grad_norm": 1.151274561882019, + "learning_rate": 9.310382041571435e-05, + "loss": 0.6147416830062866, + "step": 3352 + }, + { + "epoch": 1.4151898734177215, + "grad_norm": 1.1226807832717896, + "learning_rate": 9.309164535552453e-05, + "loss": 0.6678543090820312, + "step": 3354 + }, + { + "epoch": 1.4160337552742617, + "grad_norm": 1.375842571258545, + "learning_rate": 9.307946035491975e-05, + "loss": 0.6334129571914673, + "step": 3356 + }, + { + "epoch": 1.4168776371308016, + "grad_norm": 1.058353066444397, + "learning_rate": 9.306726541671081e-05, + "loss": 0.6582583785057068, + "step": 3358 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 1.0511330366134644, + "learning_rate": 9.305506054371084e-05, + "loss": 0.5877419114112854, + "step": 3360 + }, + { + "epoch": 1.4185654008438817, + "grad_norm": 1.2246462106704712, + "learning_rate": 9.304284573873532e-05, + "loss": 0.711665689945221, + "step": 3362 + }, + { + "epoch": 1.4194092827004219, + "grad_norm": 1.0242294073104858, + "learning_rate": 9.303062100460193e-05, + "loss": 0.6743642687797546, + "step": 3364 + }, + { + "epoch": 1.420253164556962, + "grad_norm": 1.1432100534439087, + "learning_rate": 9.301838634413069e-05, + "loss": 0.6825576424598694, + "step": 3366 + }, + { + "epoch": 1.421097046413502, + "grad_norm": 1.0128604173660278, + "learning_rate": 9.30061417601439e-05, + "loss": 0.624455988407135, + "step": 3368 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 1.2738330364227295, + "learning_rate": 9.299388725546617e-05, + "loss": 0.7029586434364319, + "step": 3370 + }, + { + "epoch": 1.4227848101265823, + "grad_norm": 1.0857324600219727, + "learning_rate": 9.298162283292435e-05, + "loss": 0.5994319915771484, + "step": 3372 + }, + { + "epoch": 1.4236286919831223, + "grad_norm": 1.0811917781829834, + "learning_rate": 9.296934849534763e-05, + "loss": 0.6537772417068481, + "step": 3374 + }, + { + "epoch": 1.4244725738396624, + "grad_norm": 1.006913185119629, + "learning_rate": 9.295706424556745e-05, + "loss": 0.5775008201599121, + "step": 3376 + }, + { + "epoch": 1.4253164556962026, + "grad_norm": 1.2306486368179321, + "learning_rate": 9.294477008641755e-05, + "loss": 0.7445536255836487, + "step": 3378 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 1.223608374595642, + "learning_rate": 9.293246602073398e-05, + "loss": 0.6081538796424866, + "step": 3380 + }, + { + "epoch": 1.4270042194092827, + "grad_norm": 1.0933321714401245, + "learning_rate": 9.2920152051355e-05, + "loss": 0.6134634613990784, + "step": 3382 + }, + { + "epoch": 1.4278481012658228, + "grad_norm": 1.1738401651382446, + "learning_rate": 9.290782818112127e-05, + "loss": 0.5961087346076965, + "step": 3384 + }, + { + "epoch": 1.4286919831223628, + "grad_norm": 1.1493438482284546, + "learning_rate": 9.289549441287561e-05, + "loss": 0.6284122467041016, + "step": 3386 + }, + { + "epoch": 1.429535864978903, + "grad_norm": 1.1907998323440552, + "learning_rate": 9.288315074946324e-05, + "loss": 0.6654639840126038, + "step": 3388 + }, + { + "epoch": 1.4303797468354431, + "grad_norm": 1.3423025608062744, + "learning_rate": 9.287079719373157e-05, + "loss": 0.652850329875946, + "step": 3390 + }, + { + "epoch": 1.431223628691983, + "grad_norm": 1.3932039737701416, + "learning_rate": 9.285843374853034e-05, + "loss": 0.703445315361023, + "step": 3392 + }, + { + "epoch": 1.4320675105485232, + "grad_norm": 5.349400043487549, + "learning_rate": 9.284606041671155e-05, + "loss": 0.693265438079834, + "step": 3394 + }, + { + "epoch": 1.4329113924050634, + "grad_norm": 1.0921961069107056, + "learning_rate": 9.28336772011295e-05, + "loss": 0.6578536033630371, + "step": 3396 + }, + { + "epoch": 1.4337552742616033, + "grad_norm": 1.184157133102417, + "learning_rate": 9.282128410464074e-05, + "loss": 0.7092277407646179, + "step": 3398 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 1.0923491716384888, + "learning_rate": 9.280888113010415e-05, + "loss": 0.6866328120231628, + "step": 3400 + }, + { + "epoch": 1.4345991561181435, + "eval_loss": 0.715917706489563, + "eval_runtime": 868.51, + "eval_samples_per_second": 2.426, + "eval_steps_per_second": 2.426, + "step": 3400 + }, + { + "epoch": 1.4354430379746836, + "grad_norm": 1.2515597343444824, + "learning_rate": 9.279646828038083e-05, + "loss": 0.6617444157600403, + "step": 3402 + }, + { + "epoch": 1.4362869198312236, + "grad_norm": 1.2122540473937988, + "learning_rate": 9.278404555833422e-05, + "loss": 0.6373176574707031, + "step": 3404 + }, + { + "epoch": 1.4371308016877637, + "grad_norm": 1.191904902458191, + "learning_rate": 9.277161296682997e-05, + "loss": 0.6506488919258118, + "step": 3406 + }, + { + "epoch": 1.437974683544304, + "grad_norm": 1.2492214441299438, + "learning_rate": 9.275917050873606e-05, + "loss": 0.7172291874885559, + "step": 3408 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 1.0518640279769897, + "learning_rate": 9.274671818692272e-05, + "loss": 0.6180248260498047, + "step": 3410 + }, + { + "epoch": 1.439662447257384, + "grad_norm": 1.150563359260559, + "learning_rate": 9.273425600426245e-05, + "loss": 0.6828892827033997, + "step": 3412 + }, + { + "epoch": 1.4405063291139242, + "grad_norm": 1.76945960521698, + "learning_rate": 9.272178396363005e-05, + "loss": 0.6585919857025146, + "step": 3414 + }, + { + "epoch": 1.4413502109704641, + "grad_norm": 1.2367758750915527, + "learning_rate": 9.270930206790257e-05, + "loss": 0.7548692226409912, + "step": 3416 + }, + { + "epoch": 1.4421940928270043, + "grad_norm": 1.2292778491973877, + "learning_rate": 9.269681031995936e-05, + "loss": 0.7017102837562561, + "step": 3418 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 1.2193396091461182, + "learning_rate": 9.268430872268202e-05, + "loss": 0.6657648682594299, + "step": 3420 + }, + { + "epoch": 1.4438818565400844, + "grad_norm": 1.0505954027175903, + "learning_rate": 9.267179727895443e-05, + "loss": 0.6950910091400146, + "step": 3422 + }, + { + "epoch": 1.4447257383966245, + "grad_norm": 1.1560698747634888, + "learning_rate": 9.265927599166272e-05, + "loss": 0.689308226108551, + "step": 3424 + }, + { + "epoch": 1.4455696202531645, + "grad_norm": 1.189336895942688, + "learning_rate": 9.264674486369533e-05, + "loss": 0.6481659412384033, + "step": 3426 + }, + { + "epoch": 1.4464135021097047, + "grad_norm": 1.3527976274490356, + "learning_rate": 9.263420389794294e-05, + "loss": 0.6626612544059753, + "step": 3428 + }, + { + "epoch": 1.4472573839662446, + "grad_norm": 1.096303105354309, + "learning_rate": 9.262165309729854e-05, + "loss": 0.690841794013977, + "step": 3430 + }, + { + "epoch": 1.4481012658227848, + "grad_norm": 1.2131421566009521, + "learning_rate": 9.260909246465732e-05, + "loss": 0.6497649550437927, + "step": 3432 + }, + { + "epoch": 1.448945147679325, + "grad_norm": 1.1831032037734985, + "learning_rate": 9.259652200291678e-05, + "loss": 0.6236130595207214, + "step": 3434 + }, + { + "epoch": 1.4497890295358649, + "grad_norm": 0.9745979309082031, + "learning_rate": 9.25839417149767e-05, + "loss": 0.5223423838615417, + "step": 3436 + }, + { + "epoch": 1.450632911392405, + "grad_norm": 1.372460126876831, + "learning_rate": 9.257135160373912e-05, + "loss": 0.6642022728919983, + "step": 3438 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.421044111251831, + "learning_rate": 9.255875167210832e-05, + "loss": 0.5426992774009705, + "step": 3440 + }, + { + "epoch": 1.4523206751054851, + "grad_norm": 1.1694250106811523, + "learning_rate": 9.254614192299086e-05, + "loss": 0.6260567307472229, + "step": 3442 + }, + { + "epoch": 1.4531645569620253, + "grad_norm": 1.0892298221588135, + "learning_rate": 9.253352235929558e-05, + "loss": 0.5776100158691406, + "step": 3444 + }, + { + "epoch": 1.4540084388185655, + "grad_norm": 1.1841259002685547, + "learning_rate": 9.252089298393356e-05, + "loss": 0.6495202779769897, + "step": 3446 + }, + { + "epoch": 1.4548523206751054, + "grad_norm": 1.1133549213409424, + "learning_rate": 9.250825379981815e-05, + "loss": 0.6570594906806946, + "step": 3448 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.197100281715393, + "learning_rate": 9.249560480986498e-05, + "loss": 0.6496587991714478, + "step": 3450 + }, + { + "epoch": 1.4565400843881857, + "grad_norm": 1.1661107540130615, + "learning_rate": 9.248294601699193e-05, + "loss": 0.6644704341888428, + "step": 3452 + }, + { + "epoch": 1.4573839662447257, + "grad_norm": 1.2257879972457886, + "learning_rate": 9.247027742411912e-05, + "loss": 0.6451231241226196, + "step": 3454 + }, + { + "epoch": 1.4582278481012658, + "grad_norm": 1.3634982109069824, + "learning_rate": 9.245759903416897e-05, + "loss": 0.6108601093292236, + "step": 3456 + }, + { + "epoch": 1.459071729957806, + "grad_norm": 1.1802605390548706, + "learning_rate": 9.244491085006615e-05, + "loss": 0.6080004572868347, + "step": 3458 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 1.280831217765808, + "learning_rate": 9.243221287473756e-05, + "loss": 0.6406423449516296, + "step": 3460 + }, + { + "epoch": 1.460759493670886, + "grad_norm": 1.3127192258834839, + "learning_rate": 9.241950511111237e-05, + "loss": 0.7320113778114319, + "step": 3462 + }, + { + "epoch": 1.4616033755274263, + "grad_norm": 1.1711835861206055, + "learning_rate": 9.240678756212204e-05, + "loss": 0.572110652923584, + "step": 3464 + }, + { + "epoch": 1.4624472573839662, + "grad_norm": 1.347143292427063, + "learning_rate": 9.239406023070028e-05, + "loss": 0.7446795105934143, + "step": 3466 + }, + { + "epoch": 1.4632911392405064, + "grad_norm": 1.4953652620315552, + "learning_rate": 9.238132311978299e-05, + "loss": 0.6709978580474854, + "step": 3468 + }, + { + "epoch": 1.4641350210970465, + "grad_norm": 1.2199387550354004, + "learning_rate": 9.236857623230842e-05, + "loss": 0.6691445112228394, + "step": 3470 + }, + { + "epoch": 1.4649789029535865, + "grad_norm": 1.0959199666976929, + "learning_rate": 9.235581957121702e-05, + "loss": 0.6964292526245117, + "step": 3472 + }, + { + "epoch": 1.4658227848101266, + "grad_norm": 1.455505609512329, + "learning_rate": 9.234305313945149e-05, + "loss": 0.6880454421043396, + "step": 3474 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.2820862531661987, + "learning_rate": 9.233027693995681e-05, + "loss": 0.6737138032913208, + "step": 3476 + }, + { + "epoch": 1.4675105485232067, + "grad_norm": 1.3459213972091675, + "learning_rate": 9.231749097568023e-05, + "loss": 0.6874006390571594, + "step": 3478 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 1.2815442085266113, + "learning_rate": 9.230469524957119e-05, + "loss": 0.7179469466209412, + "step": 3480 + }, + { + "epoch": 1.469198312236287, + "grad_norm": 1.6181597709655762, + "learning_rate": 9.229188976458145e-05, + "loss": 0.7525522112846375, + "step": 3482 + }, + { + "epoch": 1.470042194092827, + "grad_norm": 1.0633227825164795, + "learning_rate": 9.227907452366495e-05, + "loss": 0.5918128490447998, + "step": 3484 + }, + { + "epoch": 1.4708860759493672, + "grad_norm": 1.2055985927581787, + "learning_rate": 9.226624952977796e-05, + "loss": 0.6686186194419861, + "step": 3486 + }, + { + "epoch": 1.471729957805907, + "grad_norm": 1.2495088577270508, + "learning_rate": 9.225341478587893e-05, + "loss": 0.764410674571991, + "step": 3488 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 1.174229383468628, + "learning_rate": 9.22405702949286e-05, + "loss": 0.7066780924797058, + "step": 3490 + }, + { + "epoch": 1.4734177215189874, + "grad_norm": 1.0970302820205688, + "learning_rate": 9.222771605988995e-05, + "loss": 0.6740228533744812, + "step": 3492 + }, + { + "epoch": 1.4742616033755274, + "grad_norm": 1.2470436096191406, + "learning_rate": 9.221485208372822e-05, + "loss": 0.698371410369873, + "step": 3494 + }, + { + "epoch": 1.4751054852320675, + "grad_norm": 1.0750112533569336, + "learning_rate": 9.220197836941084e-05, + "loss": 0.6354188919067383, + "step": 3496 + }, + { + "epoch": 1.4759493670886075, + "grad_norm": 1.2656232118606567, + "learning_rate": 9.218909491990757e-05, + "loss": 0.7268608212471008, + "step": 3498 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.217620173819037e-05, + "loss": 0.6652966141700745, + "step": 3500 + }, + { + "epoch": 1.4767932489451476, + "eval_loss": 0.7155047059059143, + "eval_runtime": 855.8428, + "eval_samples_per_second": 2.462, + "eval_steps_per_second": 2.462, + "step": 3500 + }, + { + "epoch": 1.4776371308016878, + "grad_norm": 1.218304991722107, + "learning_rate": 9.216329882723343e-05, + "loss": 0.6845020651817322, + "step": 3502 + }, + { + "epoch": 1.4784810126582277, + "grad_norm": 1.123903512954712, + "learning_rate": 9.21503861900132e-05, + "loss": 0.6972519755363464, + "step": 3504 + }, + { + "epoch": 1.479324894514768, + "grad_norm": 1.1827739477157593, + "learning_rate": 9.213746382950839e-05, + "loss": 0.6699702739715576, + "step": 3506 + }, + { + "epoch": 1.480168776371308, + "grad_norm": 0.9934872984886169, + "learning_rate": 9.212453174869995e-05, + "loss": 0.5623225569725037, + "step": 3508 + }, + { + "epoch": 1.481012658227848, + "grad_norm": 1.221093773841858, + "learning_rate": 9.211158995057105e-05, + "loss": 0.6527173519134521, + "step": 3510 + }, + { + "epoch": 1.4818565400843882, + "grad_norm": 1.4569166898727417, + "learning_rate": 9.209863843810711e-05, + "loss": 0.7015712261199951, + "step": 3512 + }, + { + "epoch": 1.4827004219409283, + "grad_norm": 1.0764813423156738, + "learning_rate": 9.208567721429581e-05, + "loss": 0.6442505717277527, + "step": 3514 + }, + { + "epoch": 1.4835443037974683, + "grad_norm": 2.1307506561279297, + "learning_rate": 9.207270628212704e-05, + "loss": 0.666451096534729, + "step": 3516 + }, + { + "epoch": 1.4843881856540084, + "grad_norm": 1.180590271949768, + "learning_rate": 9.205972564459296e-05, + "loss": 0.6354807019233704, + "step": 3518 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 1.2999447584152222, + "learning_rate": 9.204673530468795e-05, + "loss": 0.6080324053764343, + "step": 3520 + }, + { + "epoch": 1.4860759493670885, + "grad_norm": 1.1680655479431152, + "learning_rate": 9.203373526540862e-05, + "loss": 0.6411244869232178, + "step": 3522 + }, + { + "epoch": 1.4869198312236287, + "grad_norm": 1.0565013885498047, + "learning_rate": 9.202072552975383e-05, + "loss": 0.6498287916183472, + "step": 3524 + }, + { + "epoch": 1.4877637130801689, + "grad_norm": 1.246267318725586, + "learning_rate": 9.20077061007247e-05, + "loss": 0.633613109588623, + "step": 3526 + }, + { + "epoch": 1.4886075949367088, + "grad_norm": 1.0626300573349, + "learning_rate": 9.199467698132453e-05, + "loss": 0.6102107167243958, + "step": 3528 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 1.256600260734558, + "learning_rate": 9.198163817455892e-05, + "loss": 0.669352114200592, + "step": 3530 + }, + { + "epoch": 1.4902953586497891, + "grad_norm": 1.143188238143921, + "learning_rate": 9.196858968343565e-05, + "loss": 0.6305804252624512, + "step": 3532 + }, + { + "epoch": 1.491139240506329, + "grad_norm": 1.1471205949783325, + "learning_rate": 9.195553151096475e-05, + "loss": 0.6256994605064392, + "step": 3534 + }, + { + "epoch": 1.4919831223628692, + "grad_norm": 1.1771589517593384, + "learning_rate": 9.194246366015851e-05, + "loss": 0.6395107507705688, + "step": 3536 + }, + { + "epoch": 1.4928270042194094, + "grad_norm": 1.1997097730636597, + "learning_rate": 9.192938613403144e-05, + "loss": 0.6875160932540894, + "step": 3538 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 1.3962169885635376, + "learning_rate": 9.191629893560024e-05, + "loss": 0.7216510772705078, + "step": 3540 + }, + { + "epoch": 1.4945147679324895, + "grad_norm": 1.1835654973983765, + "learning_rate": 9.19032020678839e-05, + "loss": 0.6870693564414978, + "step": 3542 + }, + { + "epoch": 1.4953586497890297, + "grad_norm": 1.112331509590149, + "learning_rate": 9.18900955339036e-05, + "loss": 0.6266092658042908, + "step": 3544 + }, + { + "epoch": 1.4962025316455696, + "grad_norm": 1.0298354625701904, + "learning_rate": 9.187697933668278e-05, + "loss": 0.5906343460083008, + "step": 3546 + }, + { + "epoch": 1.4970464135021098, + "grad_norm": 1.2650012969970703, + "learning_rate": 9.186385347924709e-05, + "loss": 0.6203610897064209, + "step": 3548 + }, + { + "epoch": 1.49789029535865, + "grad_norm": 1.1208417415618896, + "learning_rate": 9.185071796462441e-05, + "loss": 0.6841281652450562, + "step": 3550 + }, + { + "epoch": 1.4987341772151899, + "grad_norm": 1.1319488286972046, + "learning_rate": 9.183757279584486e-05, + "loss": 0.7089514136314392, + "step": 3552 + }, + { + "epoch": 1.49957805907173, + "grad_norm": 1.1104235649108887, + "learning_rate": 9.182441797594076e-05, + "loss": 0.6663861870765686, + "step": 3554 + }, + { + "epoch": 1.5004219409282702, + "grad_norm": 1.161412000656128, + "learning_rate": 9.18112535079467e-05, + "loss": 0.6713237762451172, + "step": 3556 + }, + { + "epoch": 1.5012658227848101, + "grad_norm": 1.2925246953964233, + "learning_rate": 9.179807939489945e-05, + "loss": 0.6665274500846863, + "step": 3558 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 1.0968270301818848, + "learning_rate": 9.178489563983802e-05, + "loss": 0.6881593465805054, + "step": 3560 + }, + { + "epoch": 1.5029535864978905, + "grad_norm": 1.111439824104309, + "learning_rate": 9.177170224580368e-05, + "loss": 0.631568431854248, + "step": 3562 + }, + { + "epoch": 1.5037974683544304, + "grad_norm": 1.6731075048446655, + "learning_rate": 9.175849921583986e-05, + "loss": 0.6896167397499084, + "step": 3564 + }, + { + "epoch": 1.5046413502109703, + "grad_norm": 1.226739525794983, + "learning_rate": 9.174528655299226e-05, + "loss": 0.6285277605056763, + "step": 3566 + }, + { + "epoch": 1.5054852320675105, + "grad_norm": 1.2030941247940063, + "learning_rate": 9.17320642603088e-05, + "loss": 0.6256678700447083, + "step": 3568 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 1.1980781555175781, + "learning_rate": 9.171883234083958e-05, + "loss": 0.6895992159843445, + "step": 3570 + }, + { + "epoch": 1.5071729957805906, + "grad_norm": 1.2083429098129272, + "learning_rate": 9.170559079763696e-05, + "loss": 0.6642275452613831, + "step": 3572 + }, + { + "epoch": 1.5080168776371308, + "grad_norm": 1.134020209312439, + "learning_rate": 9.169233963375552e-05, + "loss": 0.7441924214363098, + "step": 3574 + }, + { + "epoch": 1.508860759493671, + "grad_norm": 1.8178621530532837, + "learning_rate": 9.167907885225204e-05, + "loss": 0.6435995101928711, + "step": 3576 + }, + { + "epoch": 1.5097046413502109, + "grad_norm": 1.3850326538085938, + "learning_rate": 9.166580845618553e-05, + "loss": 0.6933603882789612, + "step": 3578 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 1.2500641345977783, + "learning_rate": 9.165252844861723e-05, + "loss": 0.6686714887619019, + "step": 3580 + }, + { + "epoch": 1.5113924050632912, + "grad_norm": 1.0226643085479736, + "learning_rate": 9.163923883261056e-05, + "loss": 0.607890248298645, + "step": 3582 + }, + { + "epoch": 1.5122362869198311, + "grad_norm": 1.233402132987976, + "learning_rate": 9.162593961123118e-05, + "loss": 0.6604583859443665, + "step": 3584 + }, + { + "epoch": 1.5130801687763713, + "grad_norm": 1.2609056234359741, + "learning_rate": 9.161263078754698e-05, + "loss": 0.6756428480148315, + "step": 3586 + }, + { + "epoch": 1.5139240506329115, + "grad_norm": 1.22673761844635, + "learning_rate": 9.159931236462805e-05, + "loss": 0.6990940570831299, + "step": 3588 + }, + { + "epoch": 1.5147679324894514, + "grad_norm": 1.1386182308197021, + "learning_rate": 9.158598434554668e-05, + "loss": 0.6436648964881897, + "step": 3590 + }, + { + "epoch": 1.5156118143459916, + "grad_norm": 1.1136831045150757, + "learning_rate": 9.157264673337739e-05, + "loss": 0.6420145034790039, + "step": 3592 + }, + { + "epoch": 1.5164556962025317, + "grad_norm": 1.1957908868789673, + "learning_rate": 9.155929953119693e-05, + "loss": 0.6518592834472656, + "step": 3594 + }, + { + "epoch": 1.5172995780590717, + "grad_norm": 1.1049647331237793, + "learning_rate": 9.154594274208422e-05, + "loss": 0.6891129612922668, + "step": 3596 + }, + { + "epoch": 1.5181434599156118, + "grad_norm": 1.243675947189331, + "learning_rate": 9.153257636912043e-05, + "loss": 0.6945107579231262, + "step": 3598 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.2633713483810425, + "learning_rate": 9.15192004153889e-05, + "loss": 0.7011660933494568, + "step": 3600 + }, + { + "epoch": 1.518987341772152, + "eval_loss": 0.7118256688117981, + "eval_runtime": 851.3079, + "eval_samples_per_second": 2.475, + "eval_steps_per_second": 2.475, + "step": 3600 + }, + { + "epoch": 1.519831223628692, + "grad_norm": 1.2995525598526, + "learning_rate": 9.150581488397525e-05, + "loss": 0.6843758821487427, + "step": 3602 + }, + { + "epoch": 1.520675105485232, + "grad_norm": 1.3140910863876343, + "learning_rate": 9.149241977796723e-05, + "loss": 0.6699353456497192, + "step": 3604 + }, + { + "epoch": 1.5215189873417723, + "grad_norm": 1.2674909830093384, + "learning_rate": 9.147901510045485e-05, + "loss": 0.7269271612167358, + "step": 3606 + }, + { + "epoch": 1.5223628691983122, + "grad_norm": 1.0232038497924805, + "learning_rate": 9.146560085453031e-05, + "loss": 0.5556837916374207, + "step": 3608 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 1.2598992586135864, + "learning_rate": 9.1452177043288e-05, + "loss": 0.7273092269897461, + "step": 3610 + }, + { + "epoch": 1.5240506329113925, + "grad_norm": 1.2002917528152466, + "learning_rate": 9.143874366982455e-05, + "loss": 0.6897470355033875, + "step": 3612 + }, + { + "epoch": 1.5248945147679325, + "grad_norm": 1.0959099531173706, + "learning_rate": 9.142530073723878e-05, + "loss": 0.6060715913772583, + "step": 3614 + }, + { + "epoch": 1.5257383966244724, + "grad_norm": 1.9890750646591187, + "learning_rate": 9.141184824863173e-05, + "loss": 0.6585046052932739, + "step": 3616 + }, + { + "epoch": 1.5265822784810128, + "grad_norm": 1.1460137367248535, + "learning_rate": 9.139838620710663e-05, + "loss": 0.6022046804428101, + "step": 3618 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 1.193206548690796, + "learning_rate": 9.138491461576888e-05, + "loss": 0.6332581639289856, + "step": 3620 + }, + { + "epoch": 1.5282700421940927, + "grad_norm": 1.2813689708709717, + "learning_rate": 9.137143347772614e-05, + "loss": 0.6690208315849304, + "step": 3622 + }, + { + "epoch": 1.529113924050633, + "grad_norm": 1.0950052738189697, + "learning_rate": 9.135794279608827e-05, + "loss": 0.6034293174743652, + "step": 3624 + }, + { + "epoch": 1.529957805907173, + "grad_norm": 1.208884358406067, + "learning_rate": 9.134444257396729e-05, + "loss": 0.7077960968017578, + "step": 3626 + }, + { + "epoch": 1.530801687763713, + "grad_norm": 1.093759298324585, + "learning_rate": 9.133093281447742e-05, + "loss": 0.6741147637367249, + "step": 3628 + }, + { + "epoch": 1.5316455696202531, + "grad_norm": 1.1280012130737305, + "learning_rate": 9.131741352073514e-05, + "loss": 0.6816818118095398, + "step": 3630 + }, + { + "epoch": 1.5324894514767933, + "grad_norm": 1.2868385314941406, + "learning_rate": 9.130388469585907e-05, + "loss": 0.7149180769920349, + "step": 3632 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.9654553532600403, + "learning_rate": 9.129034634297007e-05, + "loss": 0.613467812538147, + "step": 3634 + }, + { + "epoch": 1.5341772151898734, + "grad_norm": 1.8958736658096313, + "learning_rate": 9.127679846519115e-05, + "loss": 0.7034116387367249, + "step": 3636 + }, + { + "epoch": 1.5350210970464135, + "grad_norm": 1.305284857749939, + "learning_rate": 9.126324106564757e-05, + "loss": 0.7076106667518616, + "step": 3638 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 1.1843762397766113, + "learning_rate": 9.124967414746675e-05, + "loss": 0.6671180725097656, + "step": 3640 + }, + { + "epoch": 1.5367088607594936, + "grad_norm": 1.0460047721862793, + "learning_rate": 9.123609771377832e-05, + "loss": 0.667533814907074, + "step": 3642 + }, + { + "epoch": 1.5375527426160338, + "grad_norm": 1.0441135168075562, + "learning_rate": 9.122251176771409e-05, + "loss": 0.6454499959945679, + "step": 3644 + }, + { + "epoch": 1.5383966244725737, + "grad_norm": 1.5647634267807007, + "learning_rate": 9.120891631240811e-05, + "loss": 0.677007794380188, + "step": 3646 + }, + { + "epoch": 1.539240506329114, + "grad_norm": 1.0650273561477661, + "learning_rate": 9.119531135099655e-05, + "loss": 0.7017449736595154, + "step": 3648 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 1.2904767990112305, + "learning_rate": 9.118169688661784e-05, + "loss": 0.683830738067627, + "step": 3650 + }, + { + "epoch": 1.540928270042194, + "grad_norm": 1.1278672218322754, + "learning_rate": 9.116807292241257e-05, + "loss": 0.5923286080360413, + "step": 3652 + }, + { + "epoch": 1.5417721518987342, + "grad_norm": 1.1107184886932373, + "learning_rate": 9.115443946152352e-05, + "loss": 0.6595140099525452, + "step": 3654 + }, + { + "epoch": 1.5426160337552743, + "grad_norm": 1.0917898416519165, + "learning_rate": 9.114079650709566e-05, + "loss": 0.655241072177887, + "step": 3656 + }, + { + "epoch": 1.5434599156118143, + "grad_norm": 1.1922433376312256, + "learning_rate": 9.11271440622762e-05, + "loss": 0.5987096428871155, + "step": 3658 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.9974617958068848, + "learning_rate": 9.111348213021445e-05, + "loss": 0.5710145235061646, + "step": 3660 + }, + { + "epoch": 1.5451476793248946, + "grad_norm": 1.133683443069458, + "learning_rate": 9.109981071406197e-05, + "loss": 0.6067734360694885, + "step": 3662 + }, + { + "epoch": 1.5459915611814345, + "grad_norm": 1.1958736181259155, + "learning_rate": 9.108612981697248e-05, + "loss": 0.622981071472168, + "step": 3664 + }, + { + "epoch": 1.5468354430379747, + "grad_norm": 1.234328031539917, + "learning_rate": 9.107243944210194e-05, + "loss": 0.6520710587501526, + "step": 3666 + }, + { + "epoch": 1.5476793248945149, + "grad_norm": 1.0374714136123657, + "learning_rate": 9.105873959260842e-05, + "loss": 0.5993341207504272, + "step": 3668 + }, + { + "epoch": 1.5485232067510548, + "grad_norm": 0.9987428784370422, + "learning_rate": 9.104503027165223e-05, + "loss": 0.6564813852310181, + "step": 3670 + }, + { + "epoch": 1.549367088607595, + "grad_norm": 1.0823339223861694, + "learning_rate": 9.103131148239584e-05, + "loss": 0.61710524559021, + "step": 3672 + }, + { + "epoch": 1.5502109704641351, + "grad_norm": 1.3481065034866333, + "learning_rate": 9.101758322800391e-05, + "loss": 0.687752366065979, + "step": 3674 + }, + { + "epoch": 1.551054852320675, + "grad_norm": 1.2243965864181519, + "learning_rate": 9.10038455116433e-05, + "loss": 0.5981095433235168, + "step": 3676 + }, + { + "epoch": 1.5518987341772152, + "grad_norm": 1.1384631395339966, + "learning_rate": 9.0990098336483e-05, + "loss": 0.7181004285812378, + "step": 3678 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 1.042925477027893, + "learning_rate": 9.097634170569426e-05, + "loss": 0.6137188076972961, + "step": 3680 + }, + { + "epoch": 1.5535864978902953, + "grad_norm": 1.372023105621338, + "learning_rate": 9.096257562245045e-05, + "loss": 0.6761168241500854, + "step": 3682 + }, + { + "epoch": 1.5544303797468353, + "grad_norm": 1.0574673414230347, + "learning_rate": 9.094880008992714e-05, + "loss": 0.614276647567749, + "step": 3684 + }, + { + "epoch": 1.5552742616033757, + "grad_norm": 1.2894645929336548, + "learning_rate": 9.093501511130208e-05, + "loss": 0.668122410774231, + "step": 3686 + }, + { + "epoch": 1.5561181434599156, + "grad_norm": 1.2241230010986328, + "learning_rate": 9.092122068975523e-05, + "loss": 0.6305631399154663, + "step": 3688 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 1.1316208839416504, + "learning_rate": 9.090741682846866e-05, + "loss": 0.633276641368866, + "step": 3690 + }, + { + "epoch": 1.557805907172996, + "grad_norm": 1.2857953310012817, + "learning_rate": 9.089360353062666e-05, + "loss": 0.6657599806785583, + "step": 3692 + }, + { + "epoch": 1.5586497890295359, + "grad_norm": 1.2325671911239624, + "learning_rate": 9.087978079941573e-05, + "loss": 0.6379332542419434, + "step": 3694 + }, + { + "epoch": 1.5594936708860758, + "grad_norm": 1.3286080360412598, + "learning_rate": 9.086594863802445e-05, + "loss": 0.6841909885406494, + "step": 3696 + }, + { + "epoch": 1.560337552742616, + "grad_norm": 1.261890172958374, + "learning_rate": 9.085210704964368e-05, + "loss": 0.6735964417457581, + "step": 3698 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 1.0922305583953857, + "learning_rate": 9.083825603746639e-05, + "loss": 0.6602351665496826, + "step": 3700 + }, + { + "epoch": 1.5611814345991561, + "eval_loss": 0.7099412679672241, + "eval_runtime": 857.2273, + "eval_samples_per_second": 2.458, + "eval_steps_per_second": 2.458, + "step": 3700 + }, + { + "epoch": 1.562025316455696, + "grad_norm": 1.1113468408584595, + "learning_rate": 9.082439560468774e-05, + "loss": 0.6590834259986877, + "step": 3702 + }, + { + "epoch": 1.5628691983122363, + "grad_norm": 1.1476659774780273, + "learning_rate": 9.081052575450508e-05, + "loss": 0.6397460103034973, + "step": 3704 + }, + { + "epoch": 1.5637130801687764, + "grad_norm": 1.2270452976226807, + "learning_rate": 9.07966464901179e-05, + "loss": 0.6337460279464722, + "step": 3706 + }, + { + "epoch": 1.5645569620253164, + "grad_norm": 1.233667016029358, + "learning_rate": 9.07827578147279e-05, + "loss": 0.680374801158905, + "step": 3708 + }, + { + "epoch": 1.5654008438818565, + "grad_norm": 1.0761466026306152, + "learning_rate": 9.076885973153891e-05, + "loss": 0.6234241724014282, + "step": 3710 + }, + { + "epoch": 1.5662447257383967, + "grad_norm": 0.9219012260437012, + "learning_rate": 9.075495224375697e-05, + "loss": 0.6096800565719604, + "step": 3712 + }, + { + "epoch": 1.5670886075949366, + "grad_norm": 1.151168942451477, + "learning_rate": 9.074103535459026e-05, + "loss": 0.649919867515564, + "step": 3714 + }, + { + "epoch": 1.5679324894514768, + "grad_norm": 1.1380470991134644, + "learning_rate": 9.072710906724914e-05, + "loss": 0.6704574227333069, + "step": 3716 + }, + { + "epoch": 1.568776371308017, + "grad_norm": 1.2184447050094604, + "learning_rate": 9.071317338494614e-05, + "loss": 0.6619362831115723, + "step": 3718 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 1.131170630455017, + "learning_rate": 9.069922831089594e-05, + "loss": 0.6179121732711792, + "step": 3720 + }, + { + "epoch": 1.570464135021097, + "grad_norm": 1.2668405771255493, + "learning_rate": 9.06852738483154e-05, + "loss": 0.594958484172821, + "step": 3722 + }, + { + "epoch": 1.5713080168776372, + "grad_norm": 1.1624782085418701, + "learning_rate": 9.067131000042359e-05, + "loss": 0.6323778629302979, + "step": 3724 + }, + { + "epoch": 1.5721518987341772, + "grad_norm": 1.2936128377914429, + "learning_rate": 9.065733677044166e-05, + "loss": 0.628058910369873, + "step": 3726 + }, + { + "epoch": 1.5729957805907173, + "grad_norm": 1.1847784519195557, + "learning_rate": 9.064335416159296e-05, + "loss": 0.6472614407539368, + "step": 3728 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 1.8903449773788452, + "learning_rate": 9.062936217710305e-05, + "loss": 0.6395491361618042, + "step": 3730 + }, + { + "epoch": 1.5746835443037974, + "grad_norm": 1.1150785684585571, + "learning_rate": 9.061536082019956e-05, + "loss": 0.6911961436271667, + "step": 3732 + }, + { + "epoch": 1.5755274261603376, + "grad_norm": 1.1206107139587402, + "learning_rate": 9.060135009411239e-05, + "loss": 0.7051874399185181, + "step": 3734 + }, + { + "epoch": 1.5763713080168777, + "grad_norm": 1.27924382686615, + "learning_rate": 9.05873300020735e-05, + "loss": 0.7012752890586853, + "step": 3736 + }, + { + "epoch": 1.5772151898734177, + "grad_norm": 1.3970832824707031, + "learning_rate": 9.057330054731707e-05, + "loss": 0.7185142040252686, + "step": 3738 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.9732457995414734, + "learning_rate": 9.055926173307945e-05, + "loss": 0.6298858523368835, + "step": 3740 + }, + { + "epoch": 1.578902953586498, + "grad_norm": 1.230928897857666, + "learning_rate": 9.054521356259909e-05, + "loss": 0.7142943739891052, + "step": 3742 + }, + { + "epoch": 1.579746835443038, + "grad_norm": 1.1297426223754883, + "learning_rate": 9.053115603911664e-05, + "loss": 0.6535376310348511, + "step": 3744 + }, + { + "epoch": 1.580590717299578, + "grad_norm": 1.2132076025009155, + "learning_rate": 9.051708916587491e-05, + "loss": 0.6236510872840881, + "step": 3746 + }, + { + "epoch": 1.5814345991561183, + "grad_norm": 1.201319932937622, + "learning_rate": 9.050301294611885e-05, + "loss": 0.6752219200134277, + "step": 3748 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.2969163656234741, + "learning_rate": 9.048892738309559e-05, + "loss": 0.7248554825782776, + "step": 3750 + }, + { + "epoch": 1.5831223628691982, + "grad_norm": 1.0721957683563232, + "learning_rate": 9.047483248005439e-05, + "loss": 0.6488997340202332, + "step": 3752 + }, + { + "epoch": 1.5839662447257385, + "grad_norm": 0.9988508820533752, + "learning_rate": 9.046072824024667e-05, + "loss": 0.6191130876541138, + "step": 3754 + }, + { + "epoch": 1.5848101265822785, + "grad_norm": 1.260183572769165, + "learning_rate": 9.0446614666926e-05, + "loss": 0.6681985259056091, + "step": 3756 + }, + { + "epoch": 1.5856540084388184, + "grad_norm": 1.1288834810256958, + "learning_rate": 9.043249176334812e-05, + "loss": 0.662024736404419, + "step": 3758 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 1.4384263753890991, + "learning_rate": 9.04183595327709e-05, + "loss": 0.609916627407074, + "step": 3760 + }, + { + "epoch": 1.5873417721518988, + "grad_norm": 1.1109941005706787, + "learning_rate": 9.04042179784544e-05, + "loss": 0.6532528400421143, + "step": 3762 + }, + { + "epoch": 1.5881856540084387, + "grad_norm": 1.0959233045578003, + "learning_rate": 9.039006710366078e-05, + "loss": 0.7136290669441223, + "step": 3764 + }, + { + "epoch": 1.5890295358649789, + "grad_norm": 1.2313964366912842, + "learning_rate": 9.037590691165439e-05, + "loss": 0.6907190084457397, + "step": 3766 + }, + { + "epoch": 1.589873417721519, + "grad_norm": 1.3127682209014893, + "learning_rate": 9.036173740570172e-05, + "loss": 0.7114790678024292, + "step": 3768 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 1.0038903951644897, + "learning_rate": 9.034755858907138e-05, + "loss": 0.6257581114768982, + "step": 3770 + }, + { + "epoch": 1.5915611814345991, + "grad_norm": 1.1058061122894287, + "learning_rate": 9.033337046503416e-05, + "loss": 0.578145444393158, + "step": 3772 + }, + { + "epoch": 1.5924050632911393, + "grad_norm": 1.0893515348434448, + "learning_rate": 9.0319173036863e-05, + "loss": 0.6312620043754578, + "step": 3774 + }, + { + "epoch": 1.5932489451476792, + "grad_norm": 1.1091047525405884, + "learning_rate": 9.030496630783297e-05, + "loss": 0.6799508333206177, + "step": 3776 + }, + { + "epoch": 1.5940928270042194, + "grad_norm": 1.1103609800338745, + "learning_rate": 9.029075028122127e-05, + "loss": 0.678726315498352, + "step": 3778 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 1.1918376684188843, + "learning_rate": 9.027652496030728e-05, + "loss": 0.7357890009880066, + "step": 3780 + }, + { + "epoch": 1.5957805907172995, + "grad_norm": 1.0541924238204956, + "learning_rate": 9.026229034837253e-05, + "loss": 0.6079391241073608, + "step": 3782 + }, + { + "epoch": 1.5966244725738397, + "grad_norm": 1.195845603942871, + "learning_rate": 9.024804644870062e-05, + "loss": 0.7173702120780945, + "step": 3784 + }, + { + "epoch": 1.5974683544303798, + "grad_norm": 1.1362866163253784, + "learning_rate": 9.023379326457737e-05, + "loss": 0.6431670188903809, + "step": 3786 + }, + { + "epoch": 1.5983122362869198, + "grad_norm": 1.2327499389648438, + "learning_rate": 9.021953079929074e-05, + "loss": 0.6346777677536011, + "step": 3788 + }, + { + "epoch": 1.59915611814346, + "grad_norm": 1.1623177528381348, + "learning_rate": 9.020525905613078e-05, + "loss": 0.6852784156799316, + "step": 3790 + }, + { + "epoch": 1.6, + "grad_norm": 1.0258424282073975, + "learning_rate": 9.019097803838971e-05, + "loss": 0.6357095241546631, + "step": 3792 + }, + { + "epoch": 1.60084388185654, + "grad_norm": 1.0825177431106567, + "learning_rate": 9.017668774936188e-05, + "loss": 0.6663659811019897, + "step": 3794 + }, + { + "epoch": 1.6016877637130802, + "grad_norm": 1.1190401315689087, + "learning_rate": 9.016238819234381e-05, + "loss": 0.6009758710861206, + "step": 3796 + }, + { + "epoch": 1.6025316455696204, + "grad_norm": 1.09871244430542, + "learning_rate": 9.01480793706341e-05, + "loss": 0.6907890439033508, + "step": 3798 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 1.2046958208084106, + "learning_rate": 9.013376128753354e-05, + "loss": 0.6709389090538025, + "step": 3800 + }, + { + "epoch": 1.6033755274261603, + "eval_loss": 0.7080941200256348, + "eval_runtime": 865.6774, + "eval_samples_per_second": 2.434, + "eval_steps_per_second": 2.434, + "step": 3800 + }, + { + "epoch": 1.6042194092827005, + "grad_norm": 1.0671489238739014, + "learning_rate": 9.011943394634505e-05, + "loss": 0.653937041759491, + "step": 3802 + }, + { + "epoch": 1.6050632911392406, + "grad_norm": 1.4205375909805298, + "learning_rate": 9.010509735037364e-05, + "loss": 0.6647229194641113, + "step": 3804 + }, + { + "epoch": 1.6059071729957806, + "grad_norm": 1.3793799877166748, + "learning_rate": 9.009075150292652e-05, + "loss": 0.6981267929077148, + "step": 3806 + }, + { + "epoch": 1.6067510548523207, + "grad_norm": 1.0534380674362183, + "learning_rate": 9.007639640731298e-05, + "loss": 0.6151314973831177, + "step": 3808 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 1.1359853744506836, + "learning_rate": 9.006203206684447e-05, + "loss": 0.6671237349510193, + "step": 3810 + }, + { + "epoch": 1.6084388185654008, + "grad_norm": 1.2385475635528564, + "learning_rate": 9.004765848483456e-05, + "loss": 0.7145646810531616, + "step": 3812 + }, + { + "epoch": 1.6092827004219408, + "grad_norm": 1.1323930025100708, + "learning_rate": 9.003327566459899e-05, + "loss": 0.6524789929389954, + "step": 3814 + }, + { + "epoch": 1.6101265822784812, + "grad_norm": 1.1863508224487305, + "learning_rate": 9.001888360945555e-05, + "loss": 0.7574670314788818, + "step": 3816 + }, + { + "epoch": 1.610970464135021, + "grad_norm": 1.0288994312286377, + "learning_rate": 9.000448232272425e-05, + "loss": 0.5858811736106873, + "step": 3818 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 1.2674148082733154, + "learning_rate": 8.999007180772719e-05, + "loss": 0.6834250688552856, + "step": 3820 + }, + { + "epoch": 1.6126582278481014, + "grad_norm": 1.2014318704605103, + "learning_rate": 8.997565206778856e-05, + "loss": 0.6435309052467346, + "step": 3822 + }, + { + "epoch": 1.6135021097046414, + "grad_norm": 1.205741286277771, + "learning_rate": 8.996122310623476e-05, + "loss": 0.6212471127510071, + "step": 3824 + }, + { + "epoch": 1.6143459915611813, + "grad_norm": 1.0866186618804932, + "learning_rate": 8.994678492639426e-05, + "loss": 0.6832143664360046, + "step": 3826 + }, + { + "epoch": 1.6151898734177215, + "grad_norm": 1.0786924362182617, + "learning_rate": 8.993233753159768e-05, + "loss": 0.6129988431930542, + "step": 3828 + }, + { + "epoch": 1.6160337552742616, + "grad_norm": 1.176597237586975, + "learning_rate": 8.991788092517775e-05, + "loss": 0.6376019716262817, + "step": 3830 + }, + { + "epoch": 1.6168776371308016, + "grad_norm": 1.149990200996399, + "learning_rate": 8.99034151104693e-05, + "loss": 0.7300569415092468, + "step": 3832 + }, + { + "epoch": 1.6177215189873417, + "grad_norm": 1.0655301809310913, + "learning_rate": 8.988894009080936e-05, + "loss": 0.6163336634635925, + "step": 3834 + }, + { + "epoch": 1.618565400843882, + "grad_norm": 1.1596909761428833, + "learning_rate": 8.987445586953703e-05, + "loss": 0.6459008455276489, + "step": 3836 + }, + { + "epoch": 1.6194092827004218, + "grad_norm": 1.201897382736206, + "learning_rate": 8.985996244999352e-05, + "loss": 0.6166399121284485, + "step": 3838 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 1.1000950336456299, + "learning_rate": 8.984545983552219e-05, + "loss": 0.6438087224960327, + "step": 3840 + }, + { + "epoch": 1.6210970464135022, + "grad_norm": 0.9962409734725952, + "learning_rate": 8.983094802946854e-05, + "loss": 0.6238043308258057, + "step": 3842 + }, + { + "epoch": 1.621940928270042, + "grad_norm": 1.2501682043075562, + "learning_rate": 8.981642703518015e-05, + "loss": 0.6445946097373962, + "step": 3844 + }, + { + "epoch": 1.6227848101265823, + "grad_norm": 1.2027913331985474, + "learning_rate": 8.980189685600673e-05, + "loss": 0.7147613167762756, + "step": 3846 + }, + { + "epoch": 1.6236286919831224, + "grad_norm": 1.1382197141647339, + "learning_rate": 8.97873574953001e-05, + "loss": 0.6531714200973511, + "step": 3848 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 1.2600723505020142, + "learning_rate": 8.977280895641425e-05, + "loss": 0.6811055541038513, + "step": 3850 + }, + { + "epoch": 1.6253164556962025, + "grad_norm": 0.9908071160316467, + "learning_rate": 8.97582512427052e-05, + "loss": 0.6142261624336243, + "step": 3852 + }, + { + "epoch": 1.6261603375527427, + "grad_norm": 1.171557068824768, + "learning_rate": 8.974368435753117e-05, + "loss": 0.6408987045288086, + "step": 3854 + }, + { + "epoch": 1.6270042194092826, + "grad_norm": 1.1839419603347778, + "learning_rate": 8.972910830425247e-05, + "loss": 0.7352069616317749, + "step": 3856 + }, + { + "epoch": 1.6278481012658228, + "grad_norm": 1.233730673789978, + "learning_rate": 8.971452308623148e-05, + "loss": 0.7663040161132812, + "step": 3858 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 1.3636224269866943, + "learning_rate": 8.969992870683273e-05, + "loss": 0.6496971249580383, + "step": 3860 + }, + { + "epoch": 1.629535864978903, + "grad_norm": 1.2819573879241943, + "learning_rate": 8.96853251694229e-05, + "loss": 0.6079609394073486, + "step": 3862 + }, + { + "epoch": 1.630379746835443, + "grad_norm": 1.087265968322754, + "learning_rate": 8.967071247737071e-05, + "loss": 0.6299422979354858, + "step": 3864 + }, + { + "epoch": 1.6312236286919832, + "grad_norm": 1.24200439453125, + "learning_rate": 8.965609063404706e-05, + "loss": 0.6691840291023254, + "step": 3866 + }, + { + "epoch": 1.6320675105485232, + "grad_norm": 1.0771806240081787, + "learning_rate": 8.96414596428249e-05, + "loss": 0.6623613238334656, + "step": 3868 + }, + { + "epoch": 1.6329113924050633, + "grad_norm": 1.1830974817276, + "learning_rate": 8.962681950707932e-05, + "loss": 0.6663276553153992, + "step": 3870 + }, + { + "epoch": 1.6337552742616035, + "grad_norm": 1.1107177734375, + "learning_rate": 8.961217023018754e-05, + "loss": 0.6426810622215271, + "step": 3872 + }, + { + "epoch": 1.6345991561181434, + "grad_norm": 1.2528507709503174, + "learning_rate": 8.959751181552886e-05, + "loss": 0.7113696336746216, + "step": 3874 + }, + { + "epoch": 1.6354430379746834, + "grad_norm": 1.0656070709228516, + "learning_rate": 8.958284426648467e-05, + "loss": 0.6211581230163574, + "step": 3876 + }, + { + "epoch": 1.6362869198312238, + "grad_norm": 1.0627381801605225, + "learning_rate": 8.956816758643852e-05, + "loss": 0.5950066447257996, + "step": 3878 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.9812912344932556, + "learning_rate": 8.955348177877603e-05, + "loss": 0.6519815325737, + "step": 3880 + }, + { + "epoch": 1.6379746835443036, + "grad_norm": 1.1843842267990112, + "learning_rate": 8.953878684688493e-05, + "loss": 0.6830767393112183, + "step": 3882 + }, + { + "epoch": 1.638818565400844, + "grad_norm": 1.0393236875534058, + "learning_rate": 8.952408279415507e-05, + "loss": 0.5920302271842957, + "step": 3884 + }, + { + "epoch": 1.639662447257384, + "grad_norm": 0.9931944608688354, + "learning_rate": 8.950936962397838e-05, + "loss": 0.6269177198410034, + "step": 3886 + }, + { + "epoch": 1.640506329113924, + "grad_norm": 1.1461358070373535, + "learning_rate": 8.949464733974891e-05, + "loss": 0.7021532654762268, + "step": 3888 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 1.2654093503952026, + "learning_rate": 8.947991594486279e-05, + "loss": 0.7331246733665466, + "step": 3890 + }, + { + "epoch": 1.6421940928270042, + "grad_norm": 1.1487081050872803, + "learning_rate": 8.946517544271831e-05, + "loss": 0.6438513994216919, + "step": 3892 + }, + { + "epoch": 1.6430379746835442, + "grad_norm": 1.0876784324645996, + "learning_rate": 8.945042583671579e-05, + "loss": 0.6779276728630066, + "step": 3894 + }, + { + "epoch": 1.6438818565400843, + "grad_norm": 1.2382020950317383, + "learning_rate": 8.943566713025768e-05, + "loss": 0.7255419492721558, + "step": 3896 + }, + { + "epoch": 1.6447257383966245, + "grad_norm": 1.3502718210220337, + "learning_rate": 8.942089932674855e-05, + "loss": 0.7068934440612793, + "step": 3898 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.050878643989563, + "learning_rate": 8.940612242959503e-05, + "loss": 0.608700156211853, + "step": 3900 + }, + { + "epoch": 1.6455696202531644, + "eval_loss": 0.7049403786659241, + "eval_runtime": 854.9866, + "eval_samples_per_second": 2.464, + "eval_steps_per_second": 2.464, + "step": 3900 + }, + { + "epoch": 1.6464135021097046, + "grad_norm": 1.0536954402923584, + "learning_rate": 8.939133644220588e-05, + "loss": 0.6257222890853882, + "step": 3902 + }, + { + "epoch": 1.6472573839662448, + "grad_norm": 1.1903947591781616, + "learning_rate": 8.937654136799195e-05, + "loss": 0.6823404431343079, + "step": 3904 + }, + { + "epoch": 1.6481012658227847, + "grad_norm": 1.225679874420166, + "learning_rate": 8.936173721036616e-05, + "loss": 0.6596478819847107, + "step": 3906 + }, + { + "epoch": 1.6489451476793249, + "grad_norm": 1.0071430206298828, + "learning_rate": 8.934692397274354e-05, + "loss": 0.5638422966003418, + "step": 3908 + }, + { + "epoch": 1.649789029535865, + "grad_norm": 1.0146223306655884, + "learning_rate": 8.933210165854125e-05, + "loss": 0.5743419528007507, + "step": 3910 + }, + { + "epoch": 1.650632911392405, + "grad_norm": 1.122976541519165, + "learning_rate": 8.931727027117848e-05, + "loss": 0.6775169372558594, + "step": 3912 + }, + { + "epoch": 1.6514767932489451, + "grad_norm": 0.9223271012306213, + "learning_rate": 8.930242981407656e-05, + "loss": 0.5984215140342712, + "step": 3914 + }, + { + "epoch": 1.6523206751054853, + "grad_norm": 1.1599735021591187, + "learning_rate": 8.928758029065891e-05, + "loss": 0.6342158913612366, + "step": 3916 + }, + { + "epoch": 1.6531645569620252, + "grad_norm": 1.2680121660232544, + "learning_rate": 8.927272170435101e-05, + "loss": 0.678507924079895, + "step": 3918 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 1.3628549575805664, + "learning_rate": 8.925785405858047e-05, + "loss": 0.6739710569381714, + "step": 3920 + }, + { + "epoch": 1.6548523206751056, + "grad_norm": 1.163482427597046, + "learning_rate": 8.924297735677694e-05, + "loss": 0.7050020098686218, + "step": 3922 + }, + { + "epoch": 1.6556962025316455, + "grad_norm": 1.2057000398635864, + "learning_rate": 8.922809160237222e-05, + "loss": 0.6847540140151978, + "step": 3924 + }, + { + "epoch": 1.6565400843881857, + "grad_norm": 1.2784082889556885, + "learning_rate": 8.921319679880016e-05, + "loss": 0.7079069018363953, + "step": 3926 + }, + { + "epoch": 1.6573839662447258, + "grad_norm": 1.1701157093048096, + "learning_rate": 8.919829294949671e-05, + "loss": 0.665060818195343, + "step": 3928 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 1.3886606693267822, + "learning_rate": 8.918338005789988e-05, + "loss": 0.7547550201416016, + "step": 3930 + }, + { + "epoch": 1.659071729957806, + "grad_norm": 0.9504727721214294, + "learning_rate": 8.91684581274498e-05, + "loss": 0.5718522667884827, + "step": 3932 + }, + { + "epoch": 1.659915611814346, + "grad_norm": 1.1185030937194824, + "learning_rate": 8.915352716158869e-05, + "loss": 0.5984254479408264, + "step": 3934 + }, + { + "epoch": 1.660759493670886, + "grad_norm": 1.1489602327346802, + "learning_rate": 8.913858716376081e-05, + "loss": 0.6749780774116516, + "step": 3936 + }, + { + "epoch": 1.6616033755274262, + "grad_norm": 1.389431118965149, + "learning_rate": 8.912363813741255e-05, + "loss": 0.6537864804267883, + "step": 3938 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 1.0958757400512695, + "learning_rate": 8.910868008599235e-05, + "loss": 0.6033569574356079, + "step": 3940 + }, + { + "epoch": 1.6632911392405063, + "grad_norm": 1.2735344171524048, + "learning_rate": 8.909371301295075e-05, + "loss": 0.7404987215995789, + "step": 3942 + }, + { + "epoch": 1.6641350210970463, + "grad_norm": 1.123336911201477, + "learning_rate": 8.907873692174038e-05, + "loss": 0.6265006065368652, + "step": 3944 + }, + { + "epoch": 1.6649789029535866, + "grad_norm": 1.259470820426941, + "learning_rate": 8.90637518158159e-05, + "loss": 0.650705099105835, + "step": 3946 + }, + { + "epoch": 1.6658227848101266, + "grad_norm": 1.4020485877990723, + "learning_rate": 8.904875769863412e-05, + "loss": 0.7813970446586609, + "step": 3948 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.1709671020507812, + "learning_rate": 8.903375457365389e-05, + "loss": 0.6499447822570801, + "step": 3950 + }, + { + "epoch": 1.667510548523207, + "grad_norm": 1.085585355758667, + "learning_rate": 8.901874244433612e-05, + "loss": 0.6141875386238098, + "step": 3952 + }, + { + "epoch": 1.6683544303797468, + "grad_norm": 1.2340166568756104, + "learning_rate": 8.900372131414386e-05, + "loss": 0.7080221176147461, + "step": 3954 + }, + { + "epoch": 1.6691983122362868, + "grad_norm": 1.148576259613037, + "learning_rate": 8.898869118654216e-05, + "loss": 0.6340513229370117, + "step": 3956 + }, + { + "epoch": 1.6700421940928272, + "grad_norm": 1.2231999635696411, + "learning_rate": 8.89736520649982e-05, + "loss": 0.6999116539955139, + "step": 3958 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 1.1600396633148193, + "learning_rate": 8.895860395298121e-05, + "loss": 0.7177759408950806, + "step": 3960 + }, + { + "epoch": 1.671729957805907, + "grad_norm": 1.3019158840179443, + "learning_rate": 8.894354685396251e-05, + "loss": 0.6485702395439148, + "step": 3962 + }, + { + "epoch": 1.6725738396624472, + "grad_norm": 1.0153226852416992, + "learning_rate": 8.892848077141546e-05, + "loss": 0.6189450025558472, + "step": 3964 + }, + { + "epoch": 1.6734177215189874, + "grad_norm": 1.1953094005584717, + "learning_rate": 8.891340570881555e-05, + "loss": 0.6756728291511536, + "step": 3966 + }, + { + "epoch": 1.6742616033755273, + "grad_norm": 1.3376187086105347, + "learning_rate": 8.889832166964027e-05, + "loss": 0.6851167678833008, + "step": 3968 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 1.0045926570892334, + "learning_rate": 8.888322865736924e-05, + "loss": 0.5991915464401245, + "step": 3970 + }, + { + "epoch": 1.6759493670886076, + "grad_norm": 1.2115750312805176, + "learning_rate": 8.886812667548414e-05, + "loss": 0.713362455368042, + "step": 3972 + }, + { + "epoch": 1.6767932489451476, + "grad_norm": 1.1887929439544678, + "learning_rate": 8.88530157274687e-05, + "loss": 0.7058883309364319, + "step": 3974 + }, + { + "epoch": 1.6776371308016877, + "grad_norm": 1.1465295553207397, + "learning_rate": 8.883789581680868e-05, + "loss": 0.6501380801200867, + "step": 3976 + }, + { + "epoch": 1.678481012658228, + "grad_norm": 1.184693694114685, + "learning_rate": 8.882276694699204e-05, + "loss": 0.6109840273857117, + "step": 3978 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 1.2034777402877808, + "learning_rate": 8.880762912150862e-05, + "loss": 0.6815584897994995, + "step": 3980 + }, + { + "epoch": 1.680168776371308, + "grad_norm": 1.1312000751495361, + "learning_rate": 8.879248234385052e-05, + "loss": 0.6859248876571655, + "step": 3982 + }, + { + "epoch": 1.6810126582278482, + "grad_norm": 1.2273681163787842, + "learning_rate": 8.877732661751173e-05, + "loss": 0.6426702737808228, + "step": 3984 + }, + { + "epoch": 1.6818565400843881, + "grad_norm": 1.2550326585769653, + "learning_rate": 8.876216194598844e-05, + "loss": 0.6462456583976746, + "step": 3986 + }, + { + "epoch": 1.6827004219409283, + "grad_norm": 1.3111321926116943, + "learning_rate": 8.874698833277884e-05, + "loss": 0.6293925046920776, + "step": 3988 + }, + { + "epoch": 1.6835443037974684, + "grad_norm": 1.037883996963501, + "learning_rate": 8.873180578138316e-05, + "loss": 0.59798264503479, + "step": 3990 + }, + { + "epoch": 1.6843881856540084, + "grad_norm": 1.2411901950836182, + "learning_rate": 8.871661429530376e-05, + "loss": 0.6741529703140259, + "step": 3992 + }, + { + "epoch": 1.6852320675105485, + "grad_norm": 1.206354022026062, + "learning_rate": 8.8701413878045e-05, + "loss": 0.5972680449485779, + "step": 3994 + }, + { + "epoch": 1.6860759493670887, + "grad_norm": 1.1922144889831543, + "learning_rate": 8.868620453311334e-05, + "loss": 0.5879245400428772, + "step": 3996 + }, + { + "epoch": 1.6869198312236287, + "grad_norm": 1.3499996662139893, + "learning_rate": 8.867098626401729e-05, + "loss": 0.7381167411804199, + "step": 3998 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 1.3601514101028442, + "learning_rate": 8.865575907426737e-05, + "loss": 0.6590276956558228, + "step": 4000 + }, + { + "epoch": 1.6877637130801688, + "eval_loss": 0.7027890682220459, + "eval_runtime": 848.7529, + "eval_samples_per_second": 2.482, + "eval_steps_per_second": 2.482, + "step": 4000 + }, + { + "epoch": 1.688607594936709, + "grad_norm": 1.1060529947280884, + "learning_rate": 8.864052296737624e-05, + "loss": 0.5958077907562256, + "step": 4002 + }, + { + "epoch": 1.689451476793249, + "grad_norm": 1.2067371606826782, + "learning_rate": 8.862527794685858e-05, + "loss": 0.6802279353141785, + "step": 4004 + }, + { + "epoch": 1.690295358649789, + "grad_norm": 1.0094636678695679, + "learning_rate": 8.86100240162311e-05, + "loss": 0.5701603889465332, + "step": 4006 + }, + { + "epoch": 1.6911392405063292, + "grad_norm": 1.0976500511169434, + "learning_rate": 8.85947611790126e-05, + "loss": 0.6580625176429749, + "step": 4008 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.9448981285095215, + "learning_rate": 8.857948943872392e-05, + "loss": 0.5947542190551758, + "step": 4010 + }, + { + "epoch": 1.6928270042194091, + "grad_norm": 1.219609260559082, + "learning_rate": 8.856420879888796e-05, + "loss": 0.6361464262008667, + "step": 4012 + }, + { + "epoch": 1.6936708860759495, + "grad_norm": 1.2395503520965576, + "learning_rate": 8.854891926302966e-05, + "loss": 0.608664333820343, + "step": 4014 + }, + { + "epoch": 1.6945147679324895, + "grad_norm": 1.1300057172775269, + "learning_rate": 8.853362083467604e-05, + "loss": 0.6932460069656372, + "step": 4016 + }, + { + "epoch": 1.6953586497890294, + "grad_norm": 1.2300254106521606, + "learning_rate": 8.851831351735616e-05, + "loss": 0.646004855632782, + "step": 4018 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 1.2328956127166748, + "learning_rate": 8.85029973146011e-05, + "loss": 0.6760826110839844, + "step": 4020 + }, + { + "epoch": 1.6970464135021097, + "grad_norm": 1.1252286434173584, + "learning_rate": 8.848767222994401e-05, + "loss": 0.5943224430084229, + "step": 4022 + }, + { + "epoch": 1.6978902953586497, + "grad_norm": 1.1587592363357544, + "learning_rate": 8.847233826692012e-05, + "loss": 0.7535276412963867, + "step": 4024 + }, + { + "epoch": 1.6987341772151898, + "grad_norm": 1.0294606685638428, + "learning_rate": 8.845699542906667e-05, + "loss": 0.5903090834617615, + "step": 4026 + }, + { + "epoch": 1.69957805907173, + "grad_norm": 1.1940597295761108, + "learning_rate": 8.844164371992295e-05, + "loss": 0.6031379699707031, + "step": 4028 + }, + { + "epoch": 1.70042194092827, + "grad_norm": 1.0416409969329834, + "learning_rate": 8.842628314303031e-05, + "loss": 0.6185168623924255, + "step": 4030 + }, + { + "epoch": 1.70126582278481, + "grad_norm": 1.8715689182281494, + "learning_rate": 8.841091370193214e-05, + "loss": 0.6325570344924927, + "step": 4032 + }, + { + "epoch": 1.7021097046413503, + "grad_norm": 1.230658769607544, + "learning_rate": 8.839553540017387e-05, + "loss": 0.7413952350616455, + "step": 4034 + }, + { + "epoch": 1.7029535864978902, + "grad_norm": 1.298003077507019, + "learning_rate": 8.838014824130299e-05, + "loss": 0.6973189115524292, + "step": 4036 + }, + { + "epoch": 1.7037974683544304, + "grad_norm": 1.0246652364730835, + "learning_rate": 8.836475222886902e-05, + "loss": 0.6582493185997009, + "step": 4038 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 1.3652594089508057, + "learning_rate": 8.834934736642351e-05, + "loss": 0.6934399008750916, + "step": 4040 + }, + { + "epoch": 1.7054852320675105, + "grad_norm": 1.029778242111206, + "learning_rate": 8.833393365752007e-05, + "loss": 0.6437561511993408, + "step": 4042 + }, + { + "epoch": 1.7063291139240506, + "grad_norm": 1.1993004083633423, + "learning_rate": 8.831851110571437e-05, + "loss": 0.605059027671814, + "step": 4044 + }, + { + "epoch": 1.7071729957805908, + "grad_norm": 1.286389946937561, + "learning_rate": 8.830307971456406e-05, + "loss": 0.7035017609596252, + "step": 4046 + }, + { + "epoch": 1.7080168776371307, + "grad_norm": 1.1211459636688232, + "learning_rate": 8.82876394876289e-05, + "loss": 0.6429924964904785, + "step": 4048 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.1284868717193604, + "learning_rate": 8.827219042847064e-05, + "loss": 0.6454769968986511, + "step": 4050 + }, + { + "epoch": 1.709704641350211, + "grad_norm": 1.1934884786605835, + "learning_rate": 8.825673254065306e-05, + "loss": 0.707233190536499, + "step": 4052 + }, + { + "epoch": 1.710548523206751, + "grad_norm": 1.1560680866241455, + "learning_rate": 8.824126582774203e-05, + "loss": 0.6790444254875183, + "step": 4054 + }, + { + "epoch": 1.7113924050632912, + "grad_norm": 1.1924364566802979, + "learning_rate": 8.822579029330541e-05, + "loss": 0.6115295886993408, + "step": 4056 + }, + { + "epoch": 1.7122362869198313, + "grad_norm": 1.107370138168335, + "learning_rate": 8.82103059409131e-05, + "loss": 0.7039182186126709, + "step": 4058 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 1.2554657459259033, + "learning_rate": 8.819481277413707e-05, + "loss": 0.6580052971839905, + "step": 4060 + }, + { + "epoch": 1.7139240506329114, + "grad_norm": 1.2873135805130005, + "learning_rate": 8.817931079655127e-05, + "loss": 0.6042479276657104, + "step": 4062 + }, + { + "epoch": 1.7147679324894516, + "grad_norm": 1.027056097984314, + "learning_rate": 8.816380001173172e-05, + "loss": 0.5992372632026672, + "step": 4064 + }, + { + "epoch": 1.7156118143459915, + "grad_norm": 1.0694721937179565, + "learning_rate": 8.814828042325644e-05, + "loss": 0.7078655362129211, + "step": 4066 + }, + { + "epoch": 1.7164556962025317, + "grad_norm": 1.194984793663025, + "learning_rate": 8.813275203470555e-05, + "loss": 0.6618752479553223, + "step": 4068 + }, + { + "epoch": 1.7172995780590719, + "grad_norm": 1.1713165044784546, + "learning_rate": 8.811721484966109e-05, + "loss": 0.6328625679016113, + "step": 4070 + }, + { + "epoch": 1.7181434599156118, + "grad_norm": 0.9993656277656555, + "learning_rate": 8.810166887170724e-05, + "loss": 0.5916416645050049, + "step": 4072 + }, + { + "epoch": 1.7189873417721517, + "grad_norm": 1.172642707824707, + "learning_rate": 8.808611410443011e-05, + "loss": 0.6490002274513245, + "step": 4074 + }, + { + "epoch": 1.7198312236286921, + "grad_norm": 1.1404821872711182, + "learning_rate": 8.807055055141793e-05, + "loss": 0.6571791172027588, + "step": 4076 + }, + { + "epoch": 1.720675105485232, + "grad_norm": 1.2104214429855347, + "learning_rate": 8.80549782162609e-05, + "loss": 0.6233854293823242, + "step": 4078 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 1.1691396236419678, + "learning_rate": 8.803939710255126e-05, + "loss": 0.6331531405448914, + "step": 4080 + }, + { + "epoch": 1.7223628691983124, + "grad_norm": 1.263174057006836, + "learning_rate": 8.802380721388325e-05, + "loss": 0.6321156620979309, + "step": 4082 + }, + { + "epoch": 1.7232067510548523, + "grad_norm": 1.0685606002807617, + "learning_rate": 8.80082085538532e-05, + "loss": 0.644904613494873, + "step": 4084 + }, + { + "epoch": 1.7240506329113923, + "grad_norm": 1.2289735078811646, + "learning_rate": 8.799260112605938e-05, + "loss": 0.6743831634521484, + "step": 4086 + }, + { + "epoch": 1.7248945147679327, + "grad_norm": 1.0661355257034302, + "learning_rate": 8.797698493410216e-05, + "loss": 0.6866999268531799, + "step": 4088 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 1.1001228094100952, + "learning_rate": 8.796135998158386e-05, + "loss": 0.691387414932251, + "step": 4090 + }, + { + "epoch": 1.7265822784810125, + "grad_norm": 1.1078115701675415, + "learning_rate": 8.794572627210887e-05, + "loss": 0.5882864594459534, + "step": 4092 + }, + { + "epoch": 1.7274261603375527, + "grad_norm": 1.0483999252319336, + "learning_rate": 8.79300838092836e-05, + "loss": 0.6192089319229126, + "step": 4094 + }, + { + "epoch": 1.7282700421940929, + "grad_norm": 1.1194913387298584, + "learning_rate": 8.791443259671645e-05, + "loss": 0.603322446346283, + "step": 4096 + }, + { + "epoch": 1.7291139240506328, + "grad_norm": 1.1800397634506226, + "learning_rate": 8.789877263801787e-05, + "loss": 0.6141818165779114, + "step": 4098 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 1.261768102645874, + "learning_rate": 8.78831039368003e-05, + "loss": 0.6707983016967773, + "step": 4100 + }, + { + "epoch": 1.729957805907173, + "eval_loss": 0.7022181153297424, + "eval_runtime": 844.6405, + "eval_samples_per_second": 2.495, + "eval_steps_per_second": 2.495, + "step": 4100 + }, + { + "epoch": 1.7308016877637131, + "grad_norm": 1.2505232095718384, + "learning_rate": 8.786742649667822e-05, + "loss": 0.6440353989601135, + "step": 4102 + }, + { + "epoch": 1.731645569620253, + "grad_norm": 1.2631809711456299, + "learning_rate": 8.78517403212681e-05, + "loss": 0.6712808012962341, + "step": 4104 + }, + { + "epoch": 1.7324894514767932, + "grad_norm": 1.2781071662902832, + "learning_rate": 8.783604541418845e-05, + "loss": 0.6854958534240723, + "step": 4106 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.1065936088562012, + "learning_rate": 8.782034177905976e-05, + "loss": 0.6281477808952332, + "step": 4108 + }, + { + "epoch": 1.7341772151898733, + "grad_norm": 1.010961890220642, + "learning_rate": 8.780462941950457e-05, + "loss": 0.6835165619850159, + "step": 4110 + }, + { + "epoch": 1.7350210970464135, + "grad_norm": 1.1467366218566895, + "learning_rate": 8.778890833914744e-05, + "loss": 0.6674962639808655, + "step": 4112 + }, + { + "epoch": 1.7358649789029537, + "grad_norm": 1.0221859216690063, + "learning_rate": 8.77731785416149e-05, + "loss": 0.5967551469802856, + "step": 4114 + }, + { + "epoch": 1.7367088607594936, + "grad_norm": 1.347937822341919, + "learning_rate": 8.775744003053552e-05, + "loss": 0.7356855869293213, + "step": 4116 + }, + { + "epoch": 1.7375527426160338, + "grad_norm": 1.2952557802200317, + "learning_rate": 8.774169280953988e-05, + "loss": 0.6932644844055176, + "step": 4118 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 1.0157089233398438, + "learning_rate": 8.772593688226052e-05, + "loss": 0.5917407870292664, + "step": 4120 + }, + { + "epoch": 1.7392405063291139, + "grad_norm": 1.1537878513336182, + "learning_rate": 8.77101722523321e-05, + "loss": 0.6335760354995728, + "step": 4122 + }, + { + "epoch": 1.740084388185654, + "grad_norm": 1.0989667177200317, + "learning_rate": 8.769439892339115e-05, + "loss": 0.6892110109329224, + "step": 4124 + }, + { + "epoch": 1.7409282700421942, + "grad_norm": 1.1293572187423706, + "learning_rate": 8.767861689907633e-05, + "loss": 0.5966230630874634, + "step": 4126 + }, + { + "epoch": 1.7417721518987341, + "grad_norm": 1.1167775392532349, + "learning_rate": 8.76628261830282e-05, + "loss": 0.5981804728507996, + "step": 4128 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 1.0572419166564941, + "learning_rate": 8.76470267788894e-05, + "loss": 0.5539529919624329, + "step": 4130 + }, + { + "epoch": 1.7434599156118145, + "grad_norm": 0.937256932258606, + "learning_rate": 8.763121869030456e-05, + "loss": 0.6238219141960144, + "step": 4132 + }, + { + "epoch": 1.7443037974683544, + "grad_norm": 1.082932472229004, + "learning_rate": 8.761540192092029e-05, + "loss": 0.6033329963684082, + "step": 4134 + }, + { + "epoch": 1.7451476793248946, + "grad_norm": 1.0495184659957886, + "learning_rate": 8.75995764743852e-05, + "loss": 0.5567626357078552, + "step": 4136 + }, + { + "epoch": 1.7459915611814347, + "grad_norm": 1.3143779039382935, + "learning_rate": 8.758374235434994e-05, + "loss": 0.6759346127510071, + "step": 4138 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 1.2385786771774292, + "learning_rate": 8.756789956446713e-05, + "loss": 0.6439400315284729, + "step": 4140 + }, + { + "epoch": 1.7476793248945146, + "grad_norm": 1.0453747510910034, + "learning_rate": 8.75520481083914e-05, + "loss": 0.627493679523468, + "step": 4142 + }, + { + "epoch": 1.748523206751055, + "grad_norm": 1.09946608543396, + "learning_rate": 8.753618798977935e-05, + "loss": 0.677209198474884, + "step": 4144 + }, + { + "epoch": 1.749367088607595, + "grad_norm": 1.2207063436508179, + "learning_rate": 8.752031921228965e-05, + "loss": 0.6874014735221863, + "step": 4146 + }, + { + "epoch": 1.7502109704641349, + "grad_norm": 1.2520697116851807, + "learning_rate": 8.750444177958288e-05, + "loss": 0.6332831382751465, + "step": 4148 + }, + { + "epoch": 1.7510548523206753, + "grad_norm": 1.2463186979293823, + "learning_rate": 8.748855569532168e-05, + "loss": 0.682744562625885, + "step": 4150 + }, + { + "epoch": 1.7518987341772152, + "grad_norm": 1.1895235776901245, + "learning_rate": 8.747266096317069e-05, + "loss": 0.7006803750991821, + "step": 4152 + }, + { + "epoch": 1.7527426160337551, + "grad_norm": 1.1627185344696045, + "learning_rate": 8.745675758679646e-05, + "loss": 0.6751191020011902, + "step": 4154 + }, + { + "epoch": 1.7535864978902953, + "grad_norm": 1.324127197265625, + "learning_rate": 8.744084556986764e-05, + "loss": 0.661848247051239, + "step": 4156 + }, + { + "epoch": 1.7544303797468355, + "grad_norm": 1.226809024810791, + "learning_rate": 8.74249249160548e-05, + "loss": 0.7057217955589294, + "step": 4158 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 1.2341214418411255, + "learning_rate": 8.740899562903056e-05, + "loss": 0.6856105923652649, + "step": 4160 + }, + { + "epoch": 1.7561181434599156, + "grad_norm": 1.3907564878463745, + "learning_rate": 8.739305771246946e-05, + "loss": 0.6616930365562439, + "step": 4162 + }, + { + "epoch": 1.7569620253164557, + "grad_norm": 1.2756825685501099, + "learning_rate": 8.737711117004812e-05, + "loss": 0.5791551470756531, + "step": 4164 + }, + { + "epoch": 1.7578059071729957, + "grad_norm": 1.2861095666885376, + "learning_rate": 8.736115600544506e-05, + "loss": 0.7074756622314453, + "step": 4166 + }, + { + "epoch": 1.7586497890295358, + "grad_norm": 1.2198424339294434, + "learning_rate": 8.734519222234083e-05, + "loss": 0.6494167447090149, + "step": 4168 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 1.19169020652771, + "learning_rate": 8.732921982441799e-05, + "loss": 0.6546841859817505, + "step": 4170 + }, + { + "epoch": 1.760337552742616, + "grad_norm": 1.11533784866333, + "learning_rate": 8.731323881536108e-05, + "loss": 0.6701815724372864, + "step": 4172 + }, + { + "epoch": 1.761181434599156, + "grad_norm": 1.2148140668869019, + "learning_rate": 8.729724919885657e-05, + "loss": 0.6678179502487183, + "step": 4174 + }, + { + "epoch": 1.7620253164556963, + "grad_norm": 1.1968709230422974, + "learning_rate": 8.728125097859298e-05, + "loss": 0.6505144834518433, + "step": 4176 + }, + { + "epoch": 1.7628691983122362, + "grad_norm": 1.0954766273498535, + "learning_rate": 8.726524415826079e-05, + "loss": 0.6531696915626526, + "step": 4178 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 1.5149537324905396, + "learning_rate": 8.724922874155246e-05, + "loss": 0.710014283657074, + "step": 4180 + }, + { + "epoch": 1.7645569620253165, + "grad_norm": 1.145113229751587, + "learning_rate": 8.723320473216245e-05, + "loss": 0.714016318321228, + "step": 4182 + }, + { + "epoch": 1.7654008438818565, + "grad_norm": 0.9454524517059326, + "learning_rate": 8.721717213378719e-05, + "loss": 0.6775414347648621, + "step": 4184 + }, + { + "epoch": 1.7662447257383966, + "grad_norm": 1.1414754390716553, + "learning_rate": 8.720113095012507e-05, + "loss": 0.6279728412628174, + "step": 4186 + }, + { + "epoch": 1.7670886075949368, + "grad_norm": 1.212802767753601, + "learning_rate": 8.718508118487652e-05, + "loss": 0.5894309282302856, + "step": 4188 + }, + { + "epoch": 1.7679324894514767, + "grad_norm": 1.5213478803634644, + "learning_rate": 8.716902284174388e-05, + "loss": 0.6124046444892883, + "step": 4190 + }, + { + "epoch": 1.768776371308017, + "grad_norm": 0.9973840713500977, + "learning_rate": 8.715295592443154e-05, + "loss": 0.5990801453590393, + "step": 4192 + }, + { + "epoch": 1.769620253164557, + "grad_norm": 1.1084294319152832, + "learning_rate": 8.713688043664579e-05, + "loss": 0.6485559344291687, + "step": 4194 + }, + { + "epoch": 1.770464135021097, + "grad_norm": 1.1401913166046143, + "learning_rate": 8.712079638209493e-05, + "loss": 0.7083099484443665, + "step": 4196 + }, + { + "epoch": 1.7713080168776372, + "grad_norm": 1.278105616569519, + "learning_rate": 8.71047037644893e-05, + "loss": 0.7237915992736816, + "step": 4198 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.2407530546188354, + "learning_rate": 8.708860258754108e-05, + "loss": 0.6259870529174805, + "step": 4200 + }, + { + "epoch": 1.7721518987341773, + "eval_loss": 0.6993561387062073, + "eval_runtime": 542.0281, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 4200 + }, + { + "epoch": 1.7729957805907173, + "grad_norm": 1.102859616279602, + "learning_rate": 8.707249285496457e-05, + "loss": 0.6604248285293579, + "step": 4202 + }, + { + "epoch": 1.7738396624472574, + "grad_norm": 1.2478244304656982, + "learning_rate": 8.705637457047594e-05, + "loss": 0.6799775958061218, + "step": 4204 + }, + { + "epoch": 1.7746835443037976, + "grad_norm": 1.1178022623062134, + "learning_rate": 8.704024773779338e-05, + "loss": 0.6136477589607239, + "step": 4206 + }, + { + "epoch": 1.7755274261603375, + "grad_norm": 1.904076337814331, + "learning_rate": 8.702411236063703e-05, + "loss": 0.6568390130996704, + "step": 4208 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 1.0902835130691528, + "learning_rate": 8.700796844272903e-05, + "loss": 0.6404406428337097, + "step": 4210 + }, + { + "epoch": 1.7772151898734179, + "grad_norm": 1.1858288049697876, + "learning_rate": 8.699181598779347e-05, + "loss": 0.6924911737442017, + "step": 4212 + }, + { + "epoch": 1.7780590717299578, + "grad_norm": 1.0015727281570435, + "learning_rate": 8.69756549995564e-05, + "loss": 0.572692334651947, + "step": 4214 + }, + { + "epoch": 1.7789029535864977, + "grad_norm": 1.440079689025879, + "learning_rate": 8.695948548174583e-05, + "loss": 0.7196018695831299, + "step": 4216 + }, + { + "epoch": 1.7797468354430381, + "grad_norm": 1.1320992708206177, + "learning_rate": 8.69433074380918e-05, + "loss": 0.5870906710624695, + "step": 4218 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 1.3156964778900146, + "learning_rate": 8.692712087232626e-05, + "loss": 0.6501539349555969, + "step": 4220 + }, + { + "epoch": 1.781434599156118, + "grad_norm": 1.1869803667068481, + "learning_rate": 8.691092578818311e-05, + "loss": 0.7017278075218201, + "step": 4222 + }, + { + "epoch": 1.7822784810126582, + "grad_norm": 0.9708380699157715, + "learning_rate": 8.689472218939829e-05, + "loss": 0.5954802632331848, + "step": 4224 + }, + { + "epoch": 1.7831223628691983, + "grad_norm": 1.0753228664398193, + "learning_rate": 8.687851007970962e-05, + "loss": 0.6494144797325134, + "step": 4226 + }, + { + "epoch": 1.7839662447257383, + "grad_norm": 1.1038413047790527, + "learning_rate": 8.686228946285695e-05, + "loss": 0.7247282862663269, + "step": 4228 + }, + { + "epoch": 1.7848101265822784, + "grad_norm": 0.9666786789894104, + "learning_rate": 8.684606034258206e-05, + "loss": 0.5673812627792358, + "step": 4230 + }, + { + "epoch": 1.7856540084388186, + "grad_norm": 1.1972676515579224, + "learning_rate": 8.682982272262869e-05, + "loss": 0.5950504541397095, + "step": 4232 + }, + { + "epoch": 1.7864978902953585, + "grad_norm": 1.23736572265625, + "learning_rate": 8.681357660674255e-05, + "loss": 0.6477514505386353, + "step": 4234 + }, + { + "epoch": 1.7873417721518987, + "grad_norm": 1.0238158702850342, + "learning_rate": 8.679732199867127e-05, + "loss": 0.6180200576782227, + "step": 4236 + }, + { + "epoch": 1.7881856540084389, + "grad_norm": 1.0333375930786133, + "learning_rate": 8.678105890216455e-05, + "loss": 0.5771099328994751, + "step": 4238 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 1.30390202999115, + "learning_rate": 8.676478732097393e-05, + "loss": 0.6592516899108887, + "step": 4240 + }, + { + "epoch": 1.789873417721519, + "grad_norm": 1.115160346031189, + "learning_rate": 8.674850725885294e-05, + "loss": 0.6662757396697998, + "step": 4242 + }, + { + "epoch": 1.7907172995780591, + "grad_norm": 1.2130142450332642, + "learning_rate": 8.67322187195571e-05, + "loss": 0.6673333048820496, + "step": 4244 + }, + { + "epoch": 1.791561181434599, + "grad_norm": 1.1505554914474487, + "learning_rate": 8.671592170684386e-05, + "loss": 0.6698325872421265, + "step": 4246 + }, + { + "epoch": 1.7924050632911392, + "grad_norm": 1.0758062601089478, + "learning_rate": 8.669961622447262e-05, + "loss": 0.6216199398040771, + "step": 4248 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.9300920367240906, + "learning_rate": 8.668330227620475e-05, + "loss": 0.6460495591163635, + "step": 4250 + }, + { + "epoch": 1.7940928270042193, + "grad_norm": 1.3860046863555908, + "learning_rate": 8.666697986580357e-05, + "loss": 0.6949506998062134, + "step": 4252 + }, + { + "epoch": 1.7949367088607595, + "grad_norm": 1.2287555932998657, + "learning_rate": 8.665064899703433e-05, + "loss": 0.6320405602455139, + "step": 4254 + }, + { + "epoch": 1.7957805907172997, + "grad_norm": 1.1585466861724854, + "learning_rate": 8.663430967366426e-05, + "loss": 0.6635019779205322, + "step": 4256 + }, + { + "epoch": 1.7966244725738396, + "grad_norm": 1.1007941961288452, + "learning_rate": 8.661796189946252e-05, + "loss": 0.645052969455719, + "step": 4258 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 1.2059847116470337, + "learning_rate": 8.660160567820023e-05, + "loss": 0.70420902967453, + "step": 4260 + }, + { + "epoch": 1.79831223628692, + "grad_norm": 1.0648717880249023, + "learning_rate": 8.658524101365044e-05, + "loss": 0.6263765096664429, + "step": 4262 + }, + { + "epoch": 1.7991561181434599, + "grad_norm": 1.017052412033081, + "learning_rate": 8.656886790958821e-05, + "loss": 0.6199937462806702, + "step": 4264 + }, + { + "epoch": 1.8, + "grad_norm": 1.1153450012207031, + "learning_rate": 8.655248636979045e-05, + "loss": 0.5891271233558655, + "step": 4266 + }, + { + "epoch": 1.8008438818565402, + "grad_norm": 1.0661747455596924, + "learning_rate": 8.65360963980361e-05, + "loss": 0.5442121028900146, + "step": 4268 + }, + { + "epoch": 1.8016877637130801, + "grad_norm": 1.3049758672714233, + "learning_rate": 8.6519697998106e-05, + "loss": 0.6988245248794556, + "step": 4270 + }, + { + "epoch": 1.80253164556962, + "grad_norm": 1.2679938077926636, + "learning_rate": 8.650329117378294e-05, + "loss": 0.7260398864746094, + "step": 4272 + }, + { + "epoch": 1.8033755274261605, + "grad_norm": 1.0899536609649658, + "learning_rate": 8.648687592885168e-05, + "loss": 0.5757678151130676, + "step": 4274 + }, + { + "epoch": 1.8042194092827004, + "grad_norm": 1.4088575839996338, + "learning_rate": 8.647045226709887e-05, + "loss": 0.7042108178138733, + "step": 4276 + }, + { + "epoch": 1.8050632911392404, + "grad_norm": 1.2143783569335938, + "learning_rate": 8.645402019231316e-05, + "loss": 0.641275942325592, + "step": 4278 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 1.4072896242141724, + "learning_rate": 8.64375797082851e-05, + "loss": 0.7657124996185303, + "step": 4280 + }, + { + "epoch": 1.8067510548523207, + "grad_norm": 1.2563380002975464, + "learning_rate": 8.642113081880718e-05, + "loss": 0.713768720626831, + "step": 4282 + }, + { + "epoch": 1.8075949367088606, + "grad_norm": 1.1195416450500488, + "learning_rate": 8.64046735276739e-05, + "loss": 0.6276429295539856, + "step": 4284 + }, + { + "epoch": 1.808438818565401, + "grad_norm": 1.2472422122955322, + "learning_rate": 8.638820783868158e-05, + "loss": 0.5641238689422607, + "step": 4286 + }, + { + "epoch": 1.809282700421941, + "grad_norm": 1.1974313259124756, + "learning_rate": 8.637173375562855e-05, + "loss": 0.6312015056610107, + "step": 4288 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 1.1673604249954224, + "learning_rate": 8.63552512823151e-05, + "loss": 0.6674410104751587, + "step": 4290 + }, + { + "epoch": 1.810970464135021, + "grad_norm": 1.199095368385315, + "learning_rate": 8.633876042254337e-05, + "loss": 0.6772016286849976, + "step": 4292 + }, + { + "epoch": 1.8118143459915612, + "grad_norm": 1.2302746772766113, + "learning_rate": 8.632226118011752e-05, + "loss": 0.6621671915054321, + "step": 4294 + }, + { + "epoch": 1.8126582278481012, + "grad_norm": 1.304010033607483, + "learning_rate": 8.63057535588436e-05, + "loss": 0.6965363621711731, + "step": 4296 + }, + { + "epoch": 1.8135021097046413, + "grad_norm": 1.223366618156433, + "learning_rate": 8.62892375625296e-05, + "loss": 0.6300807595252991, + "step": 4298 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 1.028496265411377, + "learning_rate": 8.627271319498544e-05, + "loss": 0.5610660910606384, + "step": 4300 + }, + { + "epoch": 1.8143459915611815, + "eval_loss": 0.6981000900268555, + "eval_runtime": 514.4659, + "eval_samples_per_second": 4.096, + "eval_steps_per_second": 4.096, + "step": 4300 + }, + { + "epoch": 1.8151898734177214, + "grad_norm": 1.2050007581710815, + "learning_rate": 8.625618046002298e-05, + "loss": 0.6666551232337952, + "step": 4302 + }, + { + "epoch": 1.8160337552742616, + "grad_norm": 1.1233220100402832, + "learning_rate": 8.6239639361456e-05, + "loss": 0.6631835103034973, + "step": 4304 + }, + { + "epoch": 1.8168776371308017, + "grad_norm": 1.1262956857681274, + "learning_rate": 8.622308990310021e-05, + "loss": 0.6395270228385925, + "step": 4306 + }, + { + "epoch": 1.8177215189873417, + "grad_norm": 1.0448222160339355, + "learning_rate": 8.620653208877328e-05, + "loss": 0.6165015697479248, + "step": 4308 + }, + { + "epoch": 1.8185654008438819, + "grad_norm": 1.1555759906768799, + "learning_rate": 8.618996592229473e-05, + "loss": 0.5915844440460205, + "step": 4310 + }, + { + "epoch": 1.819409282700422, + "grad_norm": 1.5407506227493286, + "learning_rate": 8.617339140748608e-05, + "loss": 0.6491456627845764, + "step": 4312 + }, + { + "epoch": 1.820253164556962, + "grad_norm": 1.3690788745880127, + "learning_rate": 8.615680854817077e-05, + "loss": 0.6053901314735413, + "step": 4314 + }, + { + "epoch": 1.8210970464135021, + "grad_norm": 1.052583932876587, + "learning_rate": 8.614021734817413e-05, + "loss": 0.5821644067764282, + "step": 4316 + }, + { + "epoch": 1.8219409282700423, + "grad_norm": 1.090567708015442, + "learning_rate": 8.612361781132344e-05, + "loss": 0.645878255367279, + "step": 4318 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 1.122719645500183, + "learning_rate": 8.610700994144787e-05, + "loss": 0.6883123517036438, + "step": 4320 + }, + { + "epoch": 1.8236286919831224, + "grad_norm": 1.3273001909255981, + "learning_rate": 8.609039374237856e-05, + "loss": 0.6918330788612366, + "step": 4322 + }, + { + "epoch": 1.8244725738396625, + "grad_norm": 1.0628443956375122, + "learning_rate": 8.607376921794855e-05, + "loss": 0.6292204856872559, + "step": 4324 + }, + { + "epoch": 1.8253164556962025, + "grad_norm": 1.287466287612915, + "learning_rate": 8.605713637199279e-05, + "loss": 0.6136105060577393, + "step": 4326 + }, + { + "epoch": 1.8261603375527427, + "grad_norm": 1.1399345397949219, + "learning_rate": 8.604049520834816e-05, + "loss": 0.6099681854248047, + "step": 4328 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 1.1131435632705688, + "learning_rate": 8.602384573085345e-05, + "loss": 0.6267056465148926, + "step": 4330 + }, + { + "epoch": 1.8278481012658228, + "grad_norm": 1.1312925815582275, + "learning_rate": 8.600718794334939e-05, + "loss": 0.609437882900238, + "step": 4332 + }, + { + "epoch": 1.828691983122363, + "grad_norm": 1.3711494207382202, + "learning_rate": 8.599052184967859e-05, + "loss": 0.727881669998169, + "step": 4334 + }, + { + "epoch": 1.829535864978903, + "grad_norm": 1.1403605937957764, + "learning_rate": 8.597384745368562e-05, + "loss": 0.6771696209907532, + "step": 4336 + }, + { + "epoch": 1.830379746835443, + "grad_norm": 1.2769951820373535, + "learning_rate": 8.595716475921693e-05, + "loss": 0.6812924742698669, + "step": 4338 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 1.055721402168274, + "learning_rate": 8.59404737701209e-05, + "loss": 0.6403515338897705, + "step": 4340 + }, + { + "epoch": 1.8320675105485233, + "grad_norm": 1.1047639846801758, + "learning_rate": 8.592377449024784e-05, + "loss": 0.663240373134613, + "step": 4342 + }, + { + "epoch": 1.8329113924050633, + "grad_norm": 1.0808883905410767, + "learning_rate": 8.590706692344991e-05, + "loss": 0.6398993134498596, + "step": 4344 + }, + { + "epoch": 1.8337552742616032, + "grad_norm": 1.2433407306671143, + "learning_rate": 8.589035107358125e-05, + "loss": 0.6838348507881165, + "step": 4346 + }, + { + "epoch": 1.8345991561181436, + "grad_norm": 1.031216025352478, + "learning_rate": 8.58736269444979e-05, + "loss": 0.640884280204773, + "step": 4348 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 1.1417057514190674, + "learning_rate": 8.585689454005776e-05, + "loss": 0.6346741914749146, + "step": 4350 + }, + { + "epoch": 1.8362869198312235, + "grad_norm": 1.210988998413086, + "learning_rate": 8.584015386412072e-05, + "loss": 0.6209521889686584, + "step": 4352 + }, + { + "epoch": 1.8371308016877637, + "grad_norm": 1.2120760679244995, + "learning_rate": 8.582340492054847e-05, + "loss": 0.6699252128601074, + "step": 4354 + }, + { + "epoch": 1.8379746835443038, + "grad_norm": 1.1768114566802979, + "learning_rate": 8.580664771320475e-05, + "loss": 0.6472980380058289, + "step": 4356 + }, + { + "epoch": 1.8388185654008438, + "grad_norm": 1.060070276260376, + "learning_rate": 8.578988224595506e-05, + "loss": 0.6440452933311462, + "step": 4358 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 1.1366443634033203, + "learning_rate": 8.57731085226669e-05, + "loss": 0.5894474387168884, + "step": 4360 + }, + { + "epoch": 1.840506329113924, + "grad_norm": 1.1571751832962036, + "learning_rate": 8.575632654720963e-05, + "loss": 0.5868900418281555, + "step": 4362 + }, + { + "epoch": 1.841350210970464, + "grad_norm": 1.1983840465545654, + "learning_rate": 8.573953632345453e-05, + "loss": 0.5841533541679382, + "step": 4364 + }, + { + "epoch": 1.8421940928270042, + "grad_norm": 1.101806640625, + "learning_rate": 8.572273785527481e-05, + "loss": 0.5503215193748474, + "step": 4366 + }, + { + "epoch": 1.8430379746835444, + "grad_norm": 1.0327471494674683, + "learning_rate": 8.570593114654552e-05, + "loss": 0.6131128072738647, + "step": 4368 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 1.1421098709106445, + "learning_rate": 8.568911620114368e-05, + "loss": 0.6614060401916504, + "step": 4370 + }, + { + "epoch": 1.8447257383966245, + "grad_norm": 1.1707026958465576, + "learning_rate": 8.567229302294814e-05, + "loss": 0.6392307877540588, + "step": 4372 + }, + { + "epoch": 1.8455696202531646, + "grad_norm": 1.1704418659210205, + "learning_rate": 8.565546161583969e-05, + "loss": 0.6560825109481812, + "step": 4374 + }, + { + "epoch": 1.8464135021097046, + "grad_norm": 1.3618037700653076, + "learning_rate": 8.563862198370103e-05, + "loss": 0.6996290683746338, + "step": 4376 + }, + { + "epoch": 1.8472573839662447, + "grad_norm": 1.116645097732544, + "learning_rate": 8.562177413041674e-05, + "loss": 0.6776535511016846, + "step": 4378 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 1.1669151782989502, + "learning_rate": 8.560491805987327e-05, + "loss": 0.6390423774719238, + "step": 4380 + }, + { + "epoch": 1.8489451476793248, + "grad_norm": 1.2188117504119873, + "learning_rate": 8.558805377595904e-05, + "loss": 0.6554020047187805, + "step": 4382 + }, + { + "epoch": 1.849789029535865, + "grad_norm": 1.216829776763916, + "learning_rate": 8.557118128256425e-05, + "loss": 0.6291787624359131, + "step": 4384 + }, + { + "epoch": 1.8506329113924052, + "grad_norm": 1.0431596040725708, + "learning_rate": 8.555430058358111e-05, + "loss": 0.6484442949295044, + "step": 4386 + }, + { + "epoch": 1.851476793248945, + "grad_norm": 1.3015289306640625, + "learning_rate": 8.553741168290367e-05, + "loss": 0.7034047842025757, + "step": 4388 + }, + { + "epoch": 1.8523206751054853, + "grad_norm": 1.2062040567398071, + "learning_rate": 8.552051458442785e-05, + "loss": 0.644135594367981, + "step": 4390 + }, + { + "epoch": 1.8531645569620254, + "grad_norm": 1.238461971282959, + "learning_rate": 8.55036092920515e-05, + "loss": 0.6767282485961914, + "step": 4392 + }, + { + "epoch": 1.8540084388185654, + "grad_norm": 1.2978830337524414, + "learning_rate": 8.548669580967435e-05, + "loss": 0.7292267680168152, + "step": 4394 + }, + { + "epoch": 1.8548523206751055, + "grad_norm": 1.1448328495025635, + "learning_rate": 8.546977414119801e-05, + "loss": 0.6788421273231506, + "step": 4396 + }, + { + "epoch": 1.8556962025316457, + "grad_norm": 1.0685368776321411, + "learning_rate": 8.5452844290526e-05, + "loss": 0.6745942234992981, + "step": 4398 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 1.125707983970642, + "learning_rate": 8.543590626156368e-05, + "loss": 0.6351125836372375, + "step": 4400 + }, + { + "epoch": 1.8565400843881856, + "eval_loss": 0.6961485147476196, + "eval_runtime": 513.5724, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 4400 + }, + { + "epoch": 1.8573839662447258, + "grad_norm": 1.072179913520813, + "learning_rate": 8.541896005821835e-05, + "loss": 0.5840762257575989, + "step": 4402 + }, + { + "epoch": 1.858227848101266, + "grad_norm": 1.2572803497314453, + "learning_rate": 8.540200568439915e-05, + "loss": 0.6431074738502502, + "step": 4404 + }, + { + "epoch": 1.859071729957806, + "grad_norm": 1.3294413089752197, + "learning_rate": 8.538504314401718e-05, + "loss": 0.708808183670044, + "step": 4406 + }, + { + "epoch": 1.8599156118143458, + "grad_norm": 1.1775587797164917, + "learning_rate": 8.536807244098533e-05, + "loss": 0.6580085754394531, + "step": 4408 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 1.1880089044570923, + "learning_rate": 8.53510935792184e-05, + "loss": 0.6500136256217957, + "step": 4410 + }, + { + "epoch": 1.8616033755274262, + "grad_norm": 1.2166204452514648, + "learning_rate": 8.533410656263313e-05, + "loss": 0.6922352313995361, + "step": 4412 + }, + { + "epoch": 1.862447257383966, + "grad_norm": 1.0405415296554565, + "learning_rate": 8.531711139514808e-05, + "loss": 0.6761626601219177, + "step": 4414 + }, + { + "epoch": 1.8632911392405065, + "grad_norm": 1.0674270391464233, + "learning_rate": 8.530010808068371e-05, + "loss": 0.672576904296875, + "step": 4416 + }, + { + "epoch": 1.8641350210970464, + "grad_norm": 1.0584741830825806, + "learning_rate": 8.528309662316236e-05, + "loss": 0.5521218180656433, + "step": 4418 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 1.3619039058685303, + "learning_rate": 8.526607702650824e-05, + "loss": 0.6546680927276611, + "step": 4420 + }, + { + "epoch": 1.8658227848101265, + "grad_norm": 0.9904745221138, + "learning_rate": 8.524904929464745e-05, + "loss": 0.6043933629989624, + "step": 4422 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.3046703338623047, + "learning_rate": 8.523201343150795e-05, + "loss": 0.7106801271438599, + "step": 4424 + }, + { + "epoch": 1.8675105485232066, + "grad_norm": 1.1166832447052002, + "learning_rate": 8.52149694410196e-05, + "loss": 0.6456703543663025, + "step": 4426 + }, + { + "epoch": 1.8683544303797468, + "grad_norm": 1.1260632276535034, + "learning_rate": 8.519791732711412e-05, + "loss": 0.5963318347930908, + "step": 4428 + }, + { + "epoch": 1.869198312236287, + "grad_norm": 1.0990599393844604, + "learning_rate": 8.51808570937251e-05, + "loss": 0.6295356750488281, + "step": 4430 + }, + { + "epoch": 1.870042194092827, + "grad_norm": 1.3689274787902832, + "learning_rate": 8.516378874478801e-05, + "loss": 0.6984617114067078, + "step": 4432 + }, + { + "epoch": 1.870886075949367, + "grad_norm": 1.0986580848693848, + "learning_rate": 8.514671228424018e-05, + "loss": 0.5598900318145752, + "step": 4434 + }, + { + "epoch": 1.8717299578059072, + "grad_norm": 0.9570761322975159, + "learning_rate": 8.512962771602085e-05, + "loss": 0.6286435723304749, + "step": 4436 + }, + { + "epoch": 1.8725738396624472, + "grad_norm": 1.1480669975280762, + "learning_rate": 8.511253504407107e-05, + "loss": 0.5956313014030457, + "step": 4438 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 1.1132479906082153, + "learning_rate": 8.50954342723338e-05, + "loss": 0.6523844599723816, + "step": 4440 + }, + { + "epoch": 1.8742616033755275, + "grad_norm": 1.1569167375564575, + "learning_rate": 8.507832540475387e-05, + "loss": 0.6231355667114258, + "step": 4442 + }, + { + "epoch": 1.8751054852320674, + "grad_norm": 1.1327043771743774, + "learning_rate": 8.506120844527796e-05, + "loss": 0.660773754119873, + "step": 4444 + }, + { + "epoch": 1.8759493670886076, + "grad_norm": 0.8939630389213562, + "learning_rate": 8.504408339785463e-05, + "loss": 0.6319235563278198, + "step": 4446 + }, + { + "epoch": 1.8767932489451478, + "grad_norm": 1.1910638809204102, + "learning_rate": 8.50269502664343e-05, + "loss": 0.6753001809120178, + "step": 4448 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 1.1502408981323242, + "learning_rate": 8.500980905496923e-05, + "loss": 0.6300671696662903, + "step": 4450 + }, + { + "epoch": 1.8784810126582279, + "grad_norm": 1.0639009475708008, + "learning_rate": 8.49926597674136e-05, + "loss": 0.6196691989898682, + "step": 4452 + }, + { + "epoch": 1.879324894514768, + "grad_norm": 1.1072754859924316, + "learning_rate": 8.497550240772341e-05, + "loss": 0.7029181122779846, + "step": 4454 + }, + { + "epoch": 1.880168776371308, + "grad_norm": 1.0440188646316528, + "learning_rate": 8.495833697985652e-05, + "loss": 0.65432208776474, + "step": 4456 + }, + { + "epoch": 1.8810126582278481, + "grad_norm": 1.0646617412567139, + "learning_rate": 8.494116348777269e-05, + "loss": 0.6446614861488342, + "step": 4458 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.492398193543349e-05, + "loss": 0.6430497765541077, + "step": 4460 + }, + { + "epoch": 1.8827004219409282, + "grad_norm": 1.2715297937393188, + "learning_rate": 8.490679232680241e-05, + "loss": 0.6609845161437988, + "step": 4462 + }, + { + "epoch": 1.8835443037974684, + "grad_norm": 1.0435588359832764, + "learning_rate": 8.488959466584469e-05, + "loss": 0.5791062712669373, + "step": 4464 + }, + { + "epoch": 1.8843881856540086, + "grad_norm": 1.229202151298523, + "learning_rate": 8.487238895652759e-05, + "loss": 0.6312171220779419, + "step": 4466 + }, + { + "epoch": 1.8852320675105485, + "grad_norm": 1.0713022947311401, + "learning_rate": 8.485517520282008e-05, + "loss": 0.6698815226554871, + "step": 4468 + }, + { + "epoch": 1.8860759493670884, + "grad_norm": 1.0172312259674072, + "learning_rate": 8.483795340869305e-05, + "loss": 0.6283810138702393, + "step": 4470 + }, + { + "epoch": 1.8869198312236288, + "grad_norm": 1.2880207300186157, + "learning_rate": 8.482072357811926e-05, + "loss": 0.6659437417984009, + "step": 4472 + }, + { + "epoch": 1.8877637130801688, + "grad_norm": 1.0840508937835693, + "learning_rate": 8.480348571507329e-05, + "loss": 0.6190289258956909, + "step": 4474 + }, + { + "epoch": 1.8886075949367087, + "grad_norm": 1.1101994514465332, + "learning_rate": 8.478623982353156e-05, + "loss": 0.5760066509246826, + "step": 4476 + }, + { + "epoch": 1.889451476793249, + "grad_norm": 1.2388770580291748, + "learning_rate": 8.476898590747237e-05, + "loss": 0.6151811480522156, + "step": 4478 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.9986408948898315, + "learning_rate": 8.475172397087591e-05, + "loss": 0.5991593599319458, + "step": 4480 + }, + { + "epoch": 1.891139240506329, + "grad_norm": 1.1380778551101685, + "learning_rate": 8.473445401772415e-05, + "loss": 0.7262179255485535, + "step": 4482 + }, + { + "epoch": 1.8919831223628694, + "grad_norm": 1.3933676481246948, + "learning_rate": 8.471717605200092e-05, + "loss": 0.5806916356086731, + "step": 4484 + }, + { + "epoch": 1.8928270042194093, + "grad_norm": 1.0242944955825806, + "learning_rate": 8.469989007769194e-05, + "loss": 0.617904782295227, + "step": 4486 + }, + { + "epoch": 1.8936708860759492, + "grad_norm": 1.0909028053283691, + "learning_rate": 8.468259609878475e-05, + "loss": 0.6488202810287476, + "step": 4488 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 1.042611002922058, + "learning_rate": 8.466529411926874e-05, + "loss": 0.6015118956565857, + "step": 4490 + }, + { + "epoch": 1.8953586497890296, + "grad_norm": 1.3965784311294556, + "learning_rate": 8.46479841431351e-05, + "loss": 0.7035272717475891, + "step": 4492 + }, + { + "epoch": 1.8962025316455695, + "grad_norm": 1.1486462354660034, + "learning_rate": 8.463066617437698e-05, + "loss": 0.6611229777336121, + "step": 4494 + }, + { + "epoch": 1.8970464135021097, + "grad_norm": 1.0845859050750732, + "learning_rate": 8.461334021698925e-05, + "loss": 0.6378056406974792, + "step": 4496 + }, + { + "epoch": 1.8978902953586498, + "grad_norm": 0.936612069606781, + "learning_rate": 8.459600627496869e-05, + "loss": 0.642429769039154, + "step": 4498 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.1905454397201538, + "learning_rate": 8.457866435231391e-05, + "loss": 0.6341768503189087, + "step": 4500 + }, + { + "epoch": 1.8987341772151898, + "eval_loss": 0.6938078999519348, + "eval_runtime": 513.615, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 4500 + }, + { + "epoch": 1.89957805907173, + "grad_norm": 0.9778118133544922, + "learning_rate": 8.456131445302538e-05, + "loss": 0.5973100662231445, + "step": 4502 + }, + { + "epoch": 1.90042194092827, + "grad_norm": 0.9587083458900452, + "learning_rate": 8.454395658110536e-05, + "loss": 0.5982911586761475, + "step": 4504 + }, + { + "epoch": 1.90126582278481, + "grad_norm": 1.327643871307373, + "learning_rate": 8.452659074055798e-05, + "loss": 0.6858586668968201, + "step": 4506 + }, + { + "epoch": 1.9021097046413502, + "grad_norm": 1.0740257501602173, + "learning_rate": 8.450921693538922e-05, + "loss": 0.6172328591346741, + "step": 4508 + }, + { + "epoch": 1.9029535864978904, + "grad_norm": 1.0705101490020752, + "learning_rate": 8.449183516960685e-05, + "loss": 0.5349634289741516, + "step": 4510 + }, + { + "epoch": 1.9037974683544303, + "grad_norm": 0.9151237607002258, + "learning_rate": 8.447444544722058e-05, + "loss": 0.5769277811050415, + "step": 4512 + }, + { + "epoch": 1.9046413502109705, + "grad_norm": 1.139900803565979, + "learning_rate": 8.44570477722418e-05, + "loss": 0.6579093933105469, + "step": 4514 + }, + { + "epoch": 1.9054852320675106, + "grad_norm": 1.2481658458709717, + "learning_rate": 8.443964214868387e-05, + "loss": 0.6748929619789124, + "step": 4516 + }, + { + "epoch": 1.9063291139240506, + "grad_norm": 1.1661686897277832, + "learning_rate": 8.442222858056193e-05, + "loss": 0.6492021083831787, + "step": 4518 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 1.241477370262146, + "learning_rate": 8.440480707189295e-05, + "loss": 0.635409951210022, + "step": 4520 + }, + { + "epoch": 1.908016877637131, + "grad_norm": 1.1102054119110107, + "learning_rate": 8.438737762669573e-05, + "loss": 0.631928026676178, + "step": 4522 + }, + { + "epoch": 1.9088607594936708, + "grad_norm": 1.0638107061386108, + "learning_rate": 8.43699402489909e-05, + "loss": 0.604518473148346, + "step": 4524 + }, + { + "epoch": 1.909704641350211, + "grad_norm": 1.0270655155181885, + "learning_rate": 8.435249494280096e-05, + "loss": 0.61314457654953, + "step": 4526 + }, + { + "epoch": 1.9105485232067512, + "grad_norm": 1.1840111017227173, + "learning_rate": 8.433504171215018e-05, + "loss": 0.661663293838501, + "step": 4528 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 1.1404399871826172, + "learning_rate": 8.43175805610647e-05, + "loss": 0.7026967406272888, + "step": 4530 + }, + { + "epoch": 1.9122362869198313, + "grad_norm": 1.2371265888214111, + "learning_rate": 8.430011149357246e-05, + "loss": 0.6599440574645996, + "step": 4532 + }, + { + "epoch": 1.9130801687763714, + "grad_norm": 1.0042651891708374, + "learning_rate": 8.428263451370326e-05, + "loss": 0.5728344321250916, + "step": 4534 + }, + { + "epoch": 1.9139240506329114, + "grad_norm": 1.04367196559906, + "learning_rate": 8.426514962548866e-05, + "loss": 0.6495450735092163, + "step": 4536 + }, + { + "epoch": 1.9147679324894513, + "grad_norm": 1.0867135524749756, + "learning_rate": 8.424765683296215e-05, + "loss": 0.6406553387641907, + "step": 4538 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 1.0751310586929321, + "learning_rate": 8.423015614015892e-05, + "loss": 0.6692186594009399, + "step": 4540 + }, + { + "epoch": 1.9164556962025316, + "grad_norm": 1.13556969165802, + "learning_rate": 8.421264755111607e-05, + "loss": 0.6029785871505737, + "step": 4542 + }, + { + "epoch": 1.9172995780590716, + "grad_norm": 1.1560977697372437, + "learning_rate": 8.419513106987251e-05, + "loss": 0.6457844972610474, + "step": 4544 + }, + { + "epoch": 1.918143459915612, + "grad_norm": 1.2192902565002441, + "learning_rate": 8.417760670046893e-05, + "loss": 0.7082147598266602, + "step": 4546 + }, + { + "epoch": 1.918987341772152, + "grad_norm": 1.1170696020126343, + "learning_rate": 8.41600744469479e-05, + "loss": 0.6919234991073608, + "step": 4548 + }, + { + "epoch": 1.9198312236286919, + "grad_norm": 1.061253547668457, + "learning_rate": 8.414253431335373e-05, + "loss": 0.6310052871704102, + "step": 4550 + }, + { + "epoch": 1.920675105485232, + "grad_norm": 1.0671885013580322, + "learning_rate": 8.412498630373263e-05, + "loss": 0.6330236792564392, + "step": 4552 + }, + { + "epoch": 1.9215189873417722, + "grad_norm": 1.2085163593292236, + "learning_rate": 8.410743042213256e-05, + "loss": 0.7031015157699585, + "step": 4554 + }, + { + "epoch": 1.9223628691983121, + "grad_norm": 1.2682013511657715, + "learning_rate": 8.408986667260334e-05, + "loss": 0.7078304290771484, + "step": 4556 + }, + { + "epoch": 1.9232067510548523, + "grad_norm": 1.2966876029968262, + "learning_rate": 8.407229505919658e-05, + "loss": 0.6542860865592957, + "step": 4558 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 1.1086169481277466, + "learning_rate": 8.405471558596573e-05, + "loss": 0.5856828093528748, + "step": 4560 + }, + { + "epoch": 1.9248945147679324, + "grad_norm": 1.3175504207611084, + "learning_rate": 8.403712825696604e-05, + "loss": 0.7382104992866516, + "step": 4562 + }, + { + "epoch": 1.9257383966244725, + "grad_norm": 1.163164496421814, + "learning_rate": 8.401953307625454e-05, + "loss": 0.6862360239028931, + "step": 4564 + }, + { + "epoch": 1.9265822784810127, + "grad_norm": 1.207650899887085, + "learning_rate": 8.400193004789013e-05, + "loss": 0.7442302703857422, + "step": 4566 + }, + { + "epoch": 1.9274261603375527, + "grad_norm": 1.1570589542388916, + "learning_rate": 8.398431917593345e-05, + "loss": 0.595226526260376, + "step": 4568 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 1.091927170753479, + "learning_rate": 8.396670046444704e-05, + "loss": 0.6360410451889038, + "step": 4570 + }, + { + "epoch": 1.929113924050633, + "grad_norm": 1.149559497833252, + "learning_rate": 8.394907391749516e-05, + "loss": 0.6343122124671936, + "step": 4572 + }, + { + "epoch": 1.929957805907173, + "grad_norm": 1.0585254430770874, + "learning_rate": 8.393143953914395e-05, + "loss": 0.7394745349884033, + "step": 4574 + }, + { + "epoch": 1.930801687763713, + "grad_norm": 1.1648521423339844, + "learning_rate": 8.391379733346128e-05, + "loss": 0.6489678025245667, + "step": 4576 + }, + { + "epoch": 1.9316455696202532, + "grad_norm": 1.1756316423416138, + "learning_rate": 8.389614730451692e-05, + "loss": 0.6687861084938049, + "step": 4578 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.9857237339019775, + "learning_rate": 8.387848945638235e-05, + "loss": 0.523727536201477, + "step": 4580 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.1038693189620972, + "learning_rate": 8.386082379313092e-05, + "loss": 0.6545047760009766, + "step": 4582 + }, + { + "epoch": 1.9341772151898735, + "grad_norm": 1.0780832767486572, + "learning_rate": 8.384315031883774e-05, + "loss": 0.6067036390304565, + "step": 4584 + }, + { + "epoch": 1.9350210970464135, + "grad_norm": 1.2915070056915283, + "learning_rate": 8.382546903757975e-05, + "loss": 0.6880824565887451, + "step": 4586 + }, + { + "epoch": 1.9358649789029536, + "grad_norm": 1.1243441104888916, + "learning_rate": 8.380777995343568e-05, + "loss": 0.7319117188453674, + "step": 4588 + }, + { + "epoch": 1.9367088607594938, + "grad_norm": 1.1143072843551636, + "learning_rate": 8.379008307048609e-05, + "loss": 0.6845395565032959, + "step": 4590 + }, + { + "epoch": 1.9375527426160337, + "grad_norm": 1.039494276046753, + "learning_rate": 8.377237839281327e-05, + "loss": 0.6653600335121155, + "step": 4592 + }, + { + "epoch": 1.9383966244725739, + "grad_norm": 1.299617886543274, + "learning_rate": 8.375466592450136e-05, + "loss": 0.6352495551109314, + "step": 4594 + }, + { + "epoch": 1.939240506329114, + "grad_norm": 0.9918657541275024, + "learning_rate": 8.373694566963631e-05, + "loss": 0.5660957098007202, + "step": 4596 + }, + { + "epoch": 1.940084388185654, + "grad_norm": 1.0540478229522705, + "learning_rate": 8.371921763230579e-05, + "loss": 0.6296496987342834, + "step": 4598 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 1.1309545040130615, + "learning_rate": 8.370148181659939e-05, + "loss": 0.6672025918960571, + "step": 4600 + }, + { + "epoch": 1.9409282700421941, + "eval_loss": 0.6930755376815796, + "eval_runtime": 617.8927, + "eval_samples_per_second": 3.41, + "eval_steps_per_second": 3.41, + "step": 4600 + }, + { + "epoch": 1.9417721518987343, + "grad_norm": 1.2338588237762451, + "learning_rate": 8.368373822660836e-05, + "loss": 0.6200884580612183, + "step": 4602 + }, + { + "epoch": 1.9426160337552743, + "grad_norm": 1.1756945848464966, + "learning_rate": 8.366598686642582e-05, + "loss": 0.653294026851654, + "step": 4604 + }, + { + "epoch": 1.9434599156118142, + "grad_norm": 1.032018780708313, + "learning_rate": 8.364822774014671e-05, + "loss": 0.5670395493507385, + "step": 4606 + }, + { + "epoch": 1.9443037974683546, + "grad_norm": 1.045280933380127, + "learning_rate": 8.363046085186766e-05, + "loss": 0.6819197535514832, + "step": 4608 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 1.3223930597305298, + "learning_rate": 8.36126862056872e-05, + "loss": 0.6952820420265198, + "step": 4610 + }, + { + "epoch": 1.9459915611814345, + "grad_norm": 1.0048432350158691, + "learning_rate": 8.359490380570556e-05, + "loss": 0.5291440486907959, + "step": 4612 + }, + { + "epoch": 1.9468354430379748, + "grad_norm": 1.1477346420288086, + "learning_rate": 8.357711365602483e-05, + "loss": 0.6857813000679016, + "step": 4614 + }, + { + "epoch": 1.9476793248945148, + "grad_norm": 0.959985077381134, + "learning_rate": 8.355931576074882e-05, + "loss": 0.5581508278846741, + "step": 4616 + }, + { + "epoch": 1.9485232067510547, + "grad_norm": 1.1104289293289185, + "learning_rate": 8.35415101239832e-05, + "loss": 0.6536211371421814, + "step": 4618 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 1.2344517707824707, + "learning_rate": 8.352369674983535e-05, + "loss": 0.6570560336112976, + "step": 4620 + }, + { + "epoch": 1.950210970464135, + "grad_norm": 1.3411606550216675, + "learning_rate": 8.350587564241451e-05, + "loss": 0.6070495247840881, + "step": 4622 + }, + { + "epoch": 1.951054852320675, + "grad_norm": 1.1713159084320068, + "learning_rate": 8.348804680583166e-05, + "loss": 0.6444135904312134, + "step": 4624 + }, + { + "epoch": 1.9518987341772152, + "grad_norm": 1.127242922782898, + "learning_rate": 8.347021024419954e-05, + "loss": 0.6517419815063477, + "step": 4626 + }, + { + "epoch": 1.9527426160337553, + "grad_norm": 1.0733028650283813, + "learning_rate": 8.345236596163274e-05, + "loss": 0.6174065470695496, + "step": 4628 + }, + { + "epoch": 1.9535864978902953, + "grad_norm": 1.1114680767059326, + "learning_rate": 8.343451396224757e-05, + "loss": 0.7163593769073486, + "step": 4630 + }, + { + "epoch": 1.9544303797468354, + "grad_norm": 1.0839568376541138, + "learning_rate": 8.341665425016216e-05, + "loss": 0.698553204536438, + "step": 4632 + }, + { + "epoch": 1.9552742616033756, + "grad_norm": 1.17001211643219, + "learning_rate": 8.339878682949638e-05, + "loss": 0.6224857568740845, + "step": 4634 + }, + { + "epoch": 1.9561181434599155, + "grad_norm": 3.483793020248413, + "learning_rate": 8.338091170437193e-05, + "loss": 0.5931200981140137, + "step": 4636 + }, + { + "epoch": 1.9569620253164557, + "grad_norm": 1.1575394868850708, + "learning_rate": 8.336302887891224e-05, + "loss": 0.6031442284584045, + "step": 4638 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 1.1494992971420288, + "learning_rate": 8.334513835724252e-05, + "loss": 0.6101768016815186, + "step": 4640 + }, + { + "epoch": 1.9586497890295358, + "grad_norm": 1.3858197927474976, + "learning_rate": 8.332724014348981e-05, + "loss": 0.6571711301803589, + "step": 4642 + }, + { + "epoch": 1.959493670886076, + "grad_norm": 1.1094943284988403, + "learning_rate": 8.330933424178284e-05, + "loss": 0.6391071677207947, + "step": 4644 + }, + { + "epoch": 1.9603375527426161, + "grad_norm": 1.1640198230743408, + "learning_rate": 8.329142065625218e-05, + "loss": 0.6542805433273315, + "step": 4646 + }, + { + "epoch": 1.961181434599156, + "grad_norm": 1.1080211400985718, + "learning_rate": 8.327349939103016e-05, + "loss": 0.6053075194358826, + "step": 4648 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.0137052536010742, + "learning_rate": 8.325557045025085e-05, + "loss": 0.6009573340415955, + "step": 4650 + }, + { + "epoch": 1.9628691983122364, + "grad_norm": 1.0867283344268799, + "learning_rate": 8.323763383805012e-05, + "loss": 0.5993483066558838, + "step": 4652 + }, + { + "epoch": 1.9637130801687763, + "grad_norm": 1.0577161312103271, + "learning_rate": 8.321968955856562e-05, + "loss": 0.6788463592529297, + "step": 4654 + }, + { + "epoch": 1.9645569620253165, + "grad_norm": 1.2002183198928833, + "learning_rate": 8.320173761593672e-05, + "loss": 0.5786917209625244, + "step": 4656 + }, + { + "epoch": 1.9654008438818567, + "grad_norm": 1.2266993522644043, + "learning_rate": 8.318377801430461e-05, + "loss": 0.7437994480133057, + "step": 4658 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 1.007582187652588, + "learning_rate": 8.316581075781223e-05, + "loss": 0.6763550639152527, + "step": 4660 + }, + { + "epoch": 1.9670886075949368, + "grad_norm": 1.2374811172485352, + "learning_rate": 8.314783585060425e-05, + "loss": 0.6953140497207642, + "step": 4662 + }, + { + "epoch": 1.967932489451477, + "grad_norm": 1.1791057586669922, + "learning_rate": 8.312985329682717e-05, + "loss": 0.6867341995239258, + "step": 4664 + }, + { + "epoch": 1.9687763713080169, + "grad_norm": 1.1903331279754639, + "learning_rate": 8.31118631006292e-05, + "loss": 0.6445001363754272, + "step": 4666 + }, + { + "epoch": 1.9696202531645568, + "grad_norm": 1.1731067895889282, + "learning_rate": 8.309386526616034e-05, + "loss": 0.6500589847564697, + "step": 4668 + }, + { + "epoch": 1.9704641350210972, + "grad_norm": 0.9470233917236328, + "learning_rate": 8.307585979757233e-05, + "loss": 0.6215718984603882, + "step": 4670 + }, + { + "epoch": 1.9713080168776371, + "grad_norm": 1.2900800704956055, + "learning_rate": 8.305784669901872e-05, + "loss": 0.6396787762641907, + "step": 4672 + }, + { + "epoch": 1.972151898734177, + "grad_norm": 1.1729133129119873, + "learning_rate": 8.303982597465474e-05, + "loss": 0.6581959128379822, + "step": 4674 + }, + { + "epoch": 1.9729957805907175, + "grad_norm": 1.1450555324554443, + "learning_rate": 8.302179762863746e-05, + "loss": 0.7013490796089172, + "step": 4676 + }, + { + "epoch": 1.9738396624472574, + "grad_norm": 1.1506338119506836, + "learning_rate": 8.300376166512567e-05, + "loss": 0.6796102523803711, + "step": 4678 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 1.149979591369629, + "learning_rate": 8.298571808827991e-05, + "loss": 0.6960519552230835, + "step": 4680 + }, + { + "epoch": 1.9755274261603377, + "grad_norm": 1.1078912019729614, + "learning_rate": 8.296766690226249e-05, + "loss": 0.6789507865905762, + "step": 4682 + }, + { + "epoch": 1.9763713080168777, + "grad_norm": 1.0199202299118042, + "learning_rate": 8.294960811123747e-05, + "loss": 0.5962659120559692, + "step": 4684 + }, + { + "epoch": 1.9772151898734176, + "grad_norm": 1.2226134538650513, + "learning_rate": 8.293154171937068e-05, + "loss": 0.6483094692230225, + "step": 4686 + }, + { + "epoch": 1.9780590717299578, + "grad_norm": 1.184095025062561, + "learning_rate": 8.291346773082965e-05, + "loss": 0.6750242710113525, + "step": 4688 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 1.1018693447113037, + "learning_rate": 8.289538614978375e-05, + "loss": 0.7094066739082336, + "step": 4690 + }, + { + "epoch": 1.9797468354430379, + "grad_norm": 1.0342390537261963, + "learning_rate": 8.287729698040403e-05, + "loss": 0.6554126739501953, + "step": 4692 + }, + { + "epoch": 1.980590717299578, + "grad_norm": 1.0603563785552979, + "learning_rate": 8.285920022686332e-05, + "loss": 0.5493529438972473, + "step": 4694 + }, + { + "epoch": 1.9814345991561182, + "grad_norm": 1.139609932899475, + "learning_rate": 8.284109589333617e-05, + "loss": 0.6824741363525391, + "step": 4696 + }, + { + "epoch": 1.9822784810126581, + "grad_norm": 1.2167822122573853, + "learning_rate": 8.282298398399895e-05, + "loss": 0.7121000289916992, + "step": 4698 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 1.109857201576233, + "learning_rate": 8.280486450302968e-05, + "loss": 0.6711249351501465, + "step": 4700 + }, + { + "epoch": 1.9831223628691983, + "eval_loss": 0.6923081278800964, + "eval_runtime": 514.7729, + "eval_samples_per_second": 4.093, + "eval_steps_per_second": 4.093, + "step": 4700 + }, + { + "epoch": 1.9839662447257385, + "grad_norm": 1.1387107372283936, + "learning_rate": 8.27867374546082e-05, + "loss": 0.581635594367981, + "step": 4702 + }, + { + "epoch": 1.9848101265822784, + "grad_norm": 1.2519257068634033, + "learning_rate": 8.27686028429161e-05, + "loss": 0.6867302060127258, + "step": 4704 + }, + { + "epoch": 1.9856540084388186, + "grad_norm": 1.0927205085754395, + "learning_rate": 8.275046067213663e-05, + "loss": 0.6494556665420532, + "step": 4706 + }, + { + "epoch": 1.9864978902953587, + "grad_norm": 1.042035698890686, + "learning_rate": 8.273231094645487e-05, + "loss": 0.6949493288993835, + "step": 4708 + }, + { + "epoch": 1.9873417721518987, + "grad_norm": 1.0220824480056763, + "learning_rate": 8.271415367005762e-05, + "loss": 0.6535884737968445, + "step": 4710 + }, + { + "epoch": 1.9881856540084388, + "grad_norm": 1.3023611307144165, + "learning_rate": 8.269598884713339e-05, + "loss": 0.6635278463363647, + "step": 4712 + }, + { + "epoch": 1.989029535864979, + "grad_norm": 1.2526965141296387, + "learning_rate": 8.267781648187248e-05, + "loss": 0.7194697856903076, + "step": 4714 + }, + { + "epoch": 1.989873417721519, + "grad_norm": 1.0388038158416748, + "learning_rate": 8.265963657846691e-05, + "loss": 0.6355333924293518, + "step": 4716 + }, + { + "epoch": 1.990717299578059, + "grad_norm": 1.0852965116500854, + "learning_rate": 8.264144914111041e-05, + "loss": 0.6898305416107178, + "step": 4718 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 1.0714049339294434, + "learning_rate": 8.262325417399847e-05, + "loss": 0.6202836036682129, + "step": 4720 + }, + { + "epoch": 1.9924050632911392, + "grad_norm": 1.0767238140106201, + "learning_rate": 8.260505168132835e-05, + "loss": 0.6160458326339722, + "step": 4722 + }, + { + "epoch": 1.9932489451476794, + "grad_norm": 0.9605211615562439, + "learning_rate": 8.258684166729899e-05, + "loss": 0.6049920916557312, + "step": 4724 + }, + { + "epoch": 1.9940928270042195, + "grad_norm": 1.0580185651779175, + "learning_rate": 8.256862413611113e-05, + "loss": 0.5622014999389648, + "step": 4726 + }, + { + "epoch": 1.9949367088607595, + "grad_norm": 1.1039034128189087, + "learning_rate": 8.255039909196713e-05, + "loss": 0.6678924560546875, + "step": 4728 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 1.1482586860656738, + "learning_rate": 8.253216653907123e-05, + "loss": 0.658260703086853, + "step": 4730 + }, + { + "epoch": 1.9966244725738398, + "grad_norm": 1.135349988937378, + "learning_rate": 8.251392648162929e-05, + "loss": 0.6461613178253174, + "step": 4732 + }, + { + "epoch": 1.9974683544303797, + "grad_norm": 1.0155420303344727, + "learning_rate": 8.249567892384895e-05, + "loss": 0.6837426424026489, + "step": 4734 + }, + { + "epoch": 1.9983122362869197, + "grad_norm": 1.3392970561981201, + "learning_rate": 8.247742386993958e-05, + "loss": 0.6091697812080383, + "step": 4736 + }, + { + "epoch": 1.99915611814346, + "grad_norm": 1.0509974956512451, + "learning_rate": 8.245916132411226e-05, + "loss": 0.6539653539657593, + "step": 4738 + }, + { + "epoch": 2.0, + "grad_norm": 0.9777396321296692, + "learning_rate": 8.244089129057982e-05, + "loss": 0.5630147457122803, + "step": 4740 + }, + { + "epoch": 2.00084388185654, + "grad_norm": 1.1639164686203003, + "learning_rate": 8.24226137735568e-05, + "loss": 0.6190353631973267, + "step": 4742 + }, + { + "epoch": 2.0016877637130803, + "grad_norm": 1.119614839553833, + "learning_rate": 8.240432877725947e-05, + "loss": 0.6282529234886169, + "step": 4744 + }, + { + "epoch": 2.0025316455696203, + "grad_norm": 1.114739179611206, + "learning_rate": 8.238603630590581e-05, + "loss": 0.6176725625991821, + "step": 4746 + }, + { + "epoch": 2.00337552742616, + "grad_norm": 1.0543076992034912, + "learning_rate": 8.236773636371557e-05, + "loss": 0.5182007551193237, + "step": 4748 + }, + { + "epoch": 2.0042194092827006, + "grad_norm": 1.060389518737793, + "learning_rate": 8.234942895491019e-05, + "loss": 0.532536506652832, + "step": 4750 + }, + { + "epoch": 2.0050632911392405, + "grad_norm": 1.0824412107467651, + "learning_rate": 8.233111408371282e-05, + "loss": 0.5474061369895935, + "step": 4752 + }, + { + "epoch": 2.0059071729957805, + "grad_norm": 1.1450858116149902, + "learning_rate": 8.231279175434838e-05, + "loss": 0.586384654045105, + "step": 4754 + }, + { + "epoch": 2.006751054852321, + "grad_norm": 1.1225577592849731, + "learning_rate": 8.229446197104345e-05, + "loss": 0.6469444036483765, + "step": 4756 + }, + { + "epoch": 2.007594936708861, + "grad_norm": 1.7292449474334717, + "learning_rate": 8.227612473802637e-05, + "loss": 0.5371572971343994, + "step": 4758 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 1.1743781566619873, + "learning_rate": 8.22577800595272e-05, + "loss": 0.558707058429718, + "step": 4760 + }, + { + "epoch": 2.009282700421941, + "grad_norm": 1.0385273694992065, + "learning_rate": 8.223942793977769e-05, + "loss": 0.5943514108657837, + "step": 4762 + }, + { + "epoch": 2.010126582278481, + "grad_norm": 1.1302000284194946, + "learning_rate": 8.222106838301131e-05, + "loss": 0.5630753636360168, + "step": 4764 + }, + { + "epoch": 2.010970464135021, + "grad_norm": 1.140005111694336, + "learning_rate": 8.220270139346327e-05, + "loss": 0.527510404586792, + "step": 4766 + }, + { + "epoch": 2.0118143459915614, + "grad_norm": 1.1979734897613525, + "learning_rate": 8.21843269753705e-05, + "loss": 0.6315013766288757, + "step": 4768 + }, + { + "epoch": 2.0126582278481013, + "grad_norm": 1.3759459257125854, + "learning_rate": 8.21659451329716e-05, + "loss": 0.6225199699401855, + "step": 4770 + }, + { + "epoch": 2.0135021097046413, + "grad_norm": 1.330600380897522, + "learning_rate": 8.21475558705069e-05, + "loss": 0.6838938593864441, + "step": 4772 + }, + { + "epoch": 2.014345991561181, + "grad_norm": 1.2365351915359497, + "learning_rate": 8.21291591922185e-05, + "loss": 0.606302797794342, + "step": 4774 + }, + { + "epoch": 2.0151898734177216, + "grad_norm": 1.1886142492294312, + "learning_rate": 8.211075510235011e-05, + "loss": 0.6194182634353638, + "step": 4776 + }, + { + "epoch": 2.0160337552742615, + "grad_norm": 1.1414743661880493, + "learning_rate": 8.209234360514721e-05, + "loss": 0.639540433883667, + "step": 4778 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 1.2877455949783325, + "learning_rate": 8.2073924704857e-05, + "loss": 0.6350902318954468, + "step": 4780 + }, + { + "epoch": 2.017721518987342, + "grad_norm": 1.095578908920288, + "learning_rate": 8.205549840572834e-05, + "loss": 0.5152000784873962, + "step": 4782 + }, + { + "epoch": 2.018565400843882, + "grad_norm": 1.0043798685073853, + "learning_rate": 8.203706471201183e-05, + "loss": 0.46245837211608887, + "step": 4784 + }, + { + "epoch": 2.0194092827004217, + "grad_norm": 1.2133857011795044, + "learning_rate": 8.201862362795979e-05, + "loss": 0.6471722722053528, + "step": 4786 + }, + { + "epoch": 2.020253164556962, + "grad_norm": 1.0835390090942383, + "learning_rate": 8.200017515782619e-05, + "loss": 0.5790625214576721, + "step": 4788 + }, + { + "epoch": 2.021097046413502, + "grad_norm": 1.0176091194152832, + "learning_rate": 8.198171930586678e-05, + "loss": 0.5826238989830017, + "step": 4790 + }, + { + "epoch": 2.021940928270042, + "grad_norm": 1.1581370830535889, + "learning_rate": 8.196325607633893e-05, + "loss": 0.5781272649765015, + "step": 4792 + }, + { + "epoch": 2.0227848101265824, + "grad_norm": 1.243381142616272, + "learning_rate": 8.194478547350178e-05, + "loss": 0.6600401997566223, + "step": 4794 + }, + { + "epoch": 2.0236286919831223, + "grad_norm": 1.0718560218811035, + "learning_rate": 8.192630750161612e-05, + "loss": 0.5291268825531006, + "step": 4796 + }, + { + "epoch": 2.0244725738396623, + "grad_norm": 1.2338320016860962, + "learning_rate": 8.190782216494448e-05, + "loss": 0.6564924120903015, + "step": 4798 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.978547990322113, + "learning_rate": 8.188932946775107e-05, + "loss": 0.5471183657646179, + "step": 4800 + }, + { + "epoch": 2.0253164556962027, + "eval_loss": 0.6924457550048828, + "eval_runtime": 514.0427, + "eval_samples_per_second": 4.099, + "eval_steps_per_second": 4.099, + "step": 4800 + }, + { + "epoch": 2.0261603375527426, + "grad_norm": 1.1782792806625366, + "learning_rate": 8.18708294143018e-05, + "loss": 0.567442774772644, + "step": 4802 + }, + { + "epoch": 2.0270042194092825, + "grad_norm": 1.0768574476242065, + "learning_rate": 8.185232200886426e-05, + "loss": 0.6005180478096008, + "step": 4804 + }, + { + "epoch": 2.027848101265823, + "grad_norm": 1.3096717596054077, + "learning_rate": 8.18338072557078e-05, + "loss": 0.616436779499054, + "step": 4806 + }, + { + "epoch": 2.028691983122363, + "grad_norm": 1.0233508348464966, + "learning_rate": 8.181528515910336e-05, + "loss": 0.49587416648864746, + "step": 4808 + }, + { + "epoch": 2.029535864978903, + "grad_norm": 1.0800065994262695, + "learning_rate": 8.179675572332366e-05, + "loss": 0.5758571624755859, + "step": 4810 + }, + { + "epoch": 2.030379746835443, + "grad_norm": 1.09299898147583, + "learning_rate": 8.177821895264309e-05, + "loss": 0.561736524105072, + "step": 4812 + }, + { + "epoch": 2.031223628691983, + "grad_norm": 1.1439210176467896, + "learning_rate": 8.175967485133771e-05, + "loss": 0.5249468088150024, + "step": 4814 + }, + { + "epoch": 2.032067510548523, + "grad_norm": 1.15841805934906, + "learning_rate": 8.174112342368532e-05, + "loss": 0.6429001688957214, + "step": 4816 + }, + { + "epoch": 2.0329113924050635, + "grad_norm": 1.1720670461654663, + "learning_rate": 8.172256467396533e-05, + "loss": 0.60152667760849, + "step": 4818 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 1.2652091979980469, + "learning_rate": 8.170399860645892e-05, + "loss": 0.5553541779518127, + "step": 4820 + }, + { + "epoch": 2.0345991561181433, + "grad_norm": 1.0768507719039917, + "learning_rate": 8.168542522544893e-05, + "loss": 0.5369323492050171, + "step": 4822 + }, + { + "epoch": 2.0354430379746837, + "grad_norm": 0.9906469583511353, + "learning_rate": 8.166684453521986e-05, + "loss": 0.5468952655792236, + "step": 4824 + }, + { + "epoch": 2.0362869198312237, + "grad_norm": 1.3448988199234009, + "learning_rate": 8.164825654005792e-05, + "loss": 0.5795659422874451, + "step": 4826 + }, + { + "epoch": 2.0371308016877636, + "grad_norm": 1.2502341270446777, + "learning_rate": 8.162966124425103e-05, + "loss": 0.6465779542922974, + "step": 4828 + }, + { + "epoch": 2.037974683544304, + "grad_norm": 1.1512303352355957, + "learning_rate": 8.161105865208875e-05, + "loss": 0.5509394407272339, + "step": 4830 + }, + { + "epoch": 2.038818565400844, + "grad_norm": 1.2513408660888672, + "learning_rate": 8.159244876786232e-05, + "loss": 0.5515735745429993, + "step": 4832 + }, + { + "epoch": 2.039662447257384, + "grad_norm": 1.3035682439804077, + "learning_rate": 8.157383159586473e-05, + "loss": 0.757799506187439, + "step": 4834 + }, + { + "epoch": 2.0405063291139243, + "grad_norm": 1.1136540174484253, + "learning_rate": 8.155520714039056e-05, + "loss": 0.607295036315918, + "step": 4836 + }, + { + "epoch": 2.041350210970464, + "grad_norm": 1.220146656036377, + "learning_rate": 8.153657540573613e-05, + "loss": 0.5769712328910828, + "step": 4838 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 1.2104195356369019, + "learning_rate": 8.151793639619944e-05, + "loss": 0.5746933817863464, + "step": 4840 + }, + { + "epoch": 2.043037974683544, + "grad_norm": 1.241708517074585, + "learning_rate": 8.149929011608014e-05, + "loss": 0.5932332277297974, + "step": 4842 + }, + { + "epoch": 2.0438818565400845, + "grad_norm": 1.1172713041305542, + "learning_rate": 8.148063656967955e-05, + "loss": 0.583284318447113, + "step": 4844 + }, + { + "epoch": 2.0447257383966244, + "grad_norm": 1.0867618322372437, + "learning_rate": 8.14619757613007e-05, + "loss": 0.5589476823806763, + "step": 4846 + }, + { + "epoch": 2.0455696202531644, + "grad_norm": 1.2470483779907227, + "learning_rate": 8.14433076952483e-05, + "loss": 0.6118156313896179, + "step": 4848 + }, + { + "epoch": 2.0464135021097047, + "grad_norm": 1.0908832550048828, + "learning_rate": 8.142463237582868e-05, + "loss": 0.5815895795822144, + "step": 4850 + }, + { + "epoch": 2.0472573839662447, + "grad_norm": 1.2589281797409058, + "learning_rate": 8.140594980734989e-05, + "loss": 0.6232373714447021, + "step": 4852 + }, + { + "epoch": 2.0481012658227846, + "grad_norm": 1.234152913093567, + "learning_rate": 8.138725999412165e-05, + "loss": 0.5992053151130676, + "step": 4854 + }, + { + "epoch": 2.048945147679325, + "grad_norm": 1.3304446935653687, + "learning_rate": 8.136856294045533e-05, + "loss": 0.6494496464729309, + "step": 4856 + }, + { + "epoch": 2.049789029535865, + "grad_norm": 1.1871088743209839, + "learning_rate": 8.134985865066398e-05, + "loss": 0.6263431906700134, + "step": 4858 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 1.1454699039459229, + "learning_rate": 8.133114712906234e-05, + "loss": 0.6036502122879028, + "step": 4860 + }, + { + "epoch": 2.0514767932489453, + "grad_norm": 1.2953420877456665, + "learning_rate": 8.131242837996675e-05, + "loss": 0.5674451589584351, + "step": 4862 + }, + { + "epoch": 2.052320675105485, + "grad_norm": 1.1874405145645142, + "learning_rate": 8.129370240769534e-05, + "loss": 0.5616317987442017, + "step": 4864 + }, + { + "epoch": 2.053164556962025, + "grad_norm": 1.2936227321624756, + "learning_rate": 8.127496921656777e-05, + "loss": 0.6495023369789124, + "step": 4866 + }, + { + "epoch": 2.0540084388185655, + "grad_norm": 1.1935228109359741, + "learning_rate": 8.125622881090544e-05, + "loss": 0.6028099060058594, + "step": 4868 + }, + { + "epoch": 2.0548523206751055, + "grad_norm": 0.9932331442832947, + "learning_rate": 8.123748119503143e-05, + "loss": 0.476296067237854, + "step": 4870 + }, + { + "epoch": 2.0556962025316454, + "grad_norm": 1.3878839015960693, + "learning_rate": 8.121872637327042e-05, + "loss": 0.6191902756690979, + "step": 4872 + }, + { + "epoch": 2.056540084388186, + "grad_norm": 1.1185581684112549, + "learning_rate": 8.11999643499488e-05, + "loss": 0.566487729549408, + "step": 4874 + }, + { + "epoch": 2.0573839662447257, + "grad_norm": 1.3729257583618164, + "learning_rate": 8.118119512939464e-05, + "loss": 0.5970078706741333, + "step": 4876 + }, + { + "epoch": 2.0582278481012657, + "grad_norm": 1.1332688331604004, + "learning_rate": 8.11624187159376e-05, + "loss": 0.570341944694519, + "step": 4878 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 1.2648937702178955, + "learning_rate": 8.114363511390903e-05, + "loss": 0.6302897334098816, + "step": 4880 + }, + { + "epoch": 2.059915611814346, + "grad_norm": 1.250616192817688, + "learning_rate": 8.112484432764197e-05, + "loss": 0.5619142651557922, + "step": 4882 + }, + { + "epoch": 2.060759493670886, + "grad_norm": 0.9710861444473267, + "learning_rate": 8.110604636147109e-05, + "loss": 0.5426228642463684, + "step": 4884 + }, + { + "epoch": 2.0616033755274263, + "grad_norm": 1.1979506015777588, + "learning_rate": 8.108724121973271e-05, + "loss": 0.5498107671737671, + "step": 4886 + }, + { + "epoch": 2.0624472573839663, + "grad_norm": 1.0936485528945923, + "learning_rate": 8.106842890676483e-05, + "loss": 0.5695134401321411, + "step": 4888 + }, + { + "epoch": 2.0632911392405062, + "grad_norm": 1.1246092319488525, + "learning_rate": 8.10496094269071e-05, + "loss": 0.5998331308364868, + "step": 4890 + }, + { + "epoch": 2.0641350210970466, + "grad_norm": 1.244438648223877, + "learning_rate": 8.103078278450075e-05, + "loss": 0.5702623128890991, + "step": 4892 + }, + { + "epoch": 2.0649789029535865, + "grad_norm": 1.1585633754730225, + "learning_rate": 8.101194898388881e-05, + "loss": 0.5392299890518188, + "step": 4894 + }, + { + "epoch": 2.0658227848101265, + "grad_norm": 1.3044285774230957, + "learning_rate": 8.099310802941582e-05, + "loss": 0.5640127658843994, + "step": 4896 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 1.2483032941818237, + "learning_rate": 8.097425992542804e-05, + "loss": 0.6103175282478333, + "step": 4898 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 1.0845462083816528, + "learning_rate": 8.095540467627337e-05, + "loss": 0.5041166543960571, + "step": 4900 + }, + { + "epoch": 2.067510548523207, + "eval_loss": 0.6941288113594055, + "eval_runtime": 513.4497, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 4900 + }, + { + "epoch": 2.0683544303797468, + "grad_norm": 1.2493232488632202, + "learning_rate": 8.093654228630134e-05, + "loss": 0.6253946423530579, + "step": 4902 + }, + { + "epoch": 2.0691983122362867, + "grad_norm": 1.1668756008148193, + "learning_rate": 8.091767275986317e-05, + "loss": 0.523486852645874, + "step": 4904 + }, + { + "epoch": 2.070042194092827, + "grad_norm": 1.1709638833999634, + "learning_rate": 8.089879610131167e-05, + "loss": 0.5569989681243896, + "step": 4906 + }, + { + "epoch": 2.070886075949367, + "grad_norm": 1.1044740676879883, + "learning_rate": 8.087991231500133e-05, + "loss": 0.642728865146637, + "step": 4908 + }, + { + "epoch": 2.071729957805907, + "grad_norm": 1.1032549142837524, + "learning_rate": 8.086102140528828e-05, + "loss": 0.5998259782791138, + "step": 4910 + }, + { + "epoch": 2.0725738396624473, + "grad_norm": 0.9980027079582214, + "learning_rate": 8.08421233765303e-05, + "loss": 0.5460172891616821, + "step": 4912 + }, + { + "epoch": 2.0734177215189873, + "grad_norm": 1.0866090059280396, + "learning_rate": 8.082321823308679e-05, + "loss": 0.5643284916877747, + "step": 4914 + }, + { + "epoch": 2.0742616033755272, + "grad_norm": 1.1942687034606934, + "learning_rate": 8.080430597931878e-05, + "loss": 0.554400622844696, + "step": 4916 + }, + { + "epoch": 2.0751054852320676, + "grad_norm": 1.0680599212646484, + "learning_rate": 8.078538661958901e-05, + "loss": 0.5955621004104614, + "step": 4918 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 1.20845627784729, + "learning_rate": 8.076646015826179e-05, + "loss": 0.5970203280448914, + "step": 4920 + }, + { + "epoch": 2.0767932489451475, + "grad_norm": 1.8368924856185913, + "learning_rate": 8.074752659970308e-05, + "loss": 0.6467664837837219, + "step": 4922 + }, + { + "epoch": 2.077637130801688, + "grad_norm": 1.3291922807693481, + "learning_rate": 8.072858594828053e-05, + "loss": 0.630719006061554, + "step": 4924 + }, + { + "epoch": 2.078481012658228, + "grad_norm": 1.1496083736419678, + "learning_rate": 8.070963820836333e-05, + "loss": 0.601140022277832, + "step": 4926 + }, + { + "epoch": 2.0793248945147678, + "grad_norm": 1.1562724113464355, + "learning_rate": 8.069068338432239e-05, + "loss": 0.6096881031990051, + "step": 4928 + }, + { + "epoch": 2.080168776371308, + "grad_norm": 1.0115300416946411, + "learning_rate": 8.067172148053021e-05, + "loss": 0.5085908770561218, + "step": 4930 + }, + { + "epoch": 2.081012658227848, + "grad_norm": 1.2181830406188965, + "learning_rate": 8.065275250136097e-05, + "loss": 0.5268720984458923, + "step": 4932 + }, + { + "epoch": 2.081856540084388, + "grad_norm": 1.1249788999557495, + "learning_rate": 8.06337764511904e-05, + "loss": 0.6075665950775146, + "step": 4934 + }, + { + "epoch": 2.0827004219409284, + "grad_norm": 1.1143964529037476, + "learning_rate": 8.061479333439595e-05, + "loss": 0.59170001745224, + "step": 4936 + }, + { + "epoch": 2.0835443037974684, + "grad_norm": 1.4773131608963013, + "learning_rate": 8.059580315535664e-05, + "loss": 0.6689745187759399, + "step": 4938 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 1.143965244293213, + "learning_rate": 8.057680591845316e-05, + "loss": 0.5409777760505676, + "step": 4940 + }, + { + "epoch": 2.0852320675105487, + "grad_norm": 1.0384942293167114, + "learning_rate": 8.055780162806777e-05, + "loss": 0.5778636336326599, + "step": 4942 + }, + { + "epoch": 2.0860759493670886, + "grad_norm": 1.0102177858352661, + "learning_rate": 8.053879028858442e-05, + "loss": 0.5576038360595703, + "step": 4944 + }, + { + "epoch": 2.0869198312236286, + "grad_norm": 1.3792158365249634, + "learning_rate": 8.051977190438868e-05, + "loss": 0.5873376131057739, + "step": 4946 + }, + { + "epoch": 2.087763713080169, + "grad_norm": 1.4402949810028076, + "learning_rate": 8.050074647986768e-05, + "loss": 0.6067743301391602, + "step": 4948 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.2719058990478516, + "learning_rate": 8.048171401941027e-05, + "loss": 0.604671835899353, + "step": 4950 + }, + { + "epoch": 2.089451476793249, + "grad_norm": 1.1054867506027222, + "learning_rate": 8.046267452740683e-05, + "loss": 0.5743544697761536, + "step": 4952 + }, + { + "epoch": 2.090295358649789, + "grad_norm": 1.0521535873413086, + "learning_rate": 8.044362800824944e-05, + "loss": 0.576278567314148, + "step": 4954 + }, + { + "epoch": 2.091139240506329, + "grad_norm": 1.2665088176727295, + "learning_rate": 8.042457446633174e-05, + "loss": 0.5903641581535339, + "step": 4956 + }, + { + "epoch": 2.091983122362869, + "grad_norm": 1.1283398866653442, + "learning_rate": 8.040551390604902e-05, + "loss": 0.5854214429855347, + "step": 4958 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 1.1194316148757935, + "learning_rate": 8.03864463317982e-05, + "loss": 0.5843619108200073, + "step": 4960 + }, + { + "epoch": 2.0936708860759494, + "grad_norm": 1.3581651449203491, + "learning_rate": 8.036737174797778e-05, + "loss": 0.6115096211433411, + "step": 4962 + }, + { + "epoch": 2.0945147679324894, + "grad_norm": 1.341748595237732, + "learning_rate": 8.034829015898793e-05, + "loss": 0.5998795032501221, + "step": 4964 + }, + { + "epoch": 2.0953586497890297, + "grad_norm": 1.2212611436843872, + "learning_rate": 8.032920156923038e-05, + "loss": 0.628372311592102, + "step": 4966 + }, + { + "epoch": 2.0962025316455697, + "grad_norm": 1.1348317861557007, + "learning_rate": 8.031010598310851e-05, + "loss": 0.5668916702270508, + "step": 4968 + }, + { + "epoch": 2.0970464135021096, + "grad_norm": 1.1106547117233276, + "learning_rate": 8.029100340502731e-05, + "loss": 0.5253881216049194, + "step": 4970 + }, + { + "epoch": 2.09789029535865, + "grad_norm": 1.2471354007720947, + "learning_rate": 8.027189383939339e-05, + "loss": 0.5790762901306152, + "step": 4972 + }, + { + "epoch": 2.09873417721519, + "grad_norm": 1.2477394342422485, + "learning_rate": 8.025277729061492e-05, + "loss": 0.6382888555526733, + "step": 4974 + }, + { + "epoch": 2.09957805907173, + "grad_norm": 1.2716054916381836, + "learning_rate": 8.023365376310176e-05, + "loss": 0.5962072610855103, + "step": 4976 + }, + { + "epoch": 2.10042194092827, + "grad_norm": 1.257820725440979, + "learning_rate": 8.021452326126532e-05, + "loss": 0.5882940292358398, + "step": 4978 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 1.0924186706542969, + "learning_rate": 8.019538578951864e-05, + "loss": 0.5640701055526733, + "step": 4980 + }, + { + "epoch": 2.10210970464135, + "grad_norm": 1.1250383853912354, + "learning_rate": 8.017624135227637e-05, + "loss": 0.5746428966522217, + "step": 4982 + }, + { + "epoch": 2.10295358649789, + "grad_norm": 1.131323218345642, + "learning_rate": 8.015708995395477e-05, + "loss": 0.5611346960067749, + "step": 4984 + }, + { + "epoch": 2.1037974683544305, + "grad_norm": 1.4267152547836304, + "learning_rate": 8.013793159897171e-05, + "loss": 0.6173797249794006, + "step": 4986 + }, + { + "epoch": 2.1046413502109704, + "grad_norm": 1.41414213180542, + "learning_rate": 8.011876629174662e-05, + "loss": 0.64865642786026, + "step": 4988 + }, + { + "epoch": 2.1054852320675104, + "grad_norm": 1.1498184204101562, + "learning_rate": 8.00995940367006e-05, + "loss": 0.6125827431678772, + "step": 4990 + }, + { + "epoch": 2.1063291139240508, + "grad_norm": 1.2327708005905151, + "learning_rate": 8.00804148382563e-05, + "loss": 0.670495867729187, + "step": 4992 + }, + { + "epoch": 2.1071729957805907, + "grad_norm": 1.2797311544418335, + "learning_rate": 8.0061228700838e-05, + "loss": 0.6020209193229675, + "step": 4994 + }, + { + "epoch": 2.1080168776371306, + "grad_norm": 1.079584002494812, + "learning_rate": 8.004203562887157e-05, + "loss": 0.5974310636520386, + "step": 4996 + }, + { + "epoch": 2.108860759493671, + "grad_norm": 1.4352604150772095, + "learning_rate": 8.002283562678452e-05, + "loss": 0.6424587368965149, + "step": 4998 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 1.0876719951629639, + "learning_rate": 8.000362869900586e-05, + "loss": 0.6185846328735352, + "step": 5000 + }, + { + "epoch": 2.109704641350211, + "eval_loss": 0.6908889412879944, + "eval_runtime": 675.8398, + "eval_samples_per_second": 3.118, + "eval_steps_per_second": 3.118, + "step": 5000 + }, + { + "epoch": 2.110548523206751, + "grad_norm": 1.0125762224197388, + "learning_rate": 7.998441484996631e-05, + "loss": 0.6127280592918396, + "step": 5002 + }, + { + "epoch": 2.1113924050632913, + "grad_norm": 1.0253753662109375, + "learning_rate": 7.99651940840981e-05, + "loss": 0.5495694875717163, + "step": 5004 + }, + { + "epoch": 2.1122362869198312, + "grad_norm": 1.5620673894882202, + "learning_rate": 7.994596640583511e-05, + "loss": 0.6199497580528259, + "step": 5006 + }, + { + "epoch": 2.113080168776371, + "grad_norm": 1.3032969236373901, + "learning_rate": 7.992673181961281e-05, + "loss": 0.5896390676498413, + "step": 5008 + }, + { + "epoch": 2.1139240506329116, + "grad_norm": 1.0933046340942383, + "learning_rate": 7.990749032986821e-05, + "loss": 0.6332341432571411, + "step": 5010 + }, + { + "epoch": 2.1147679324894515, + "grad_norm": 1.3115314245224, + "learning_rate": 7.988824194104e-05, + "loss": 0.5964323282241821, + "step": 5012 + }, + { + "epoch": 2.1156118143459914, + "grad_norm": 1.229978084564209, + "learning_rate": 7.986898665756837e-05, + "loss": 0.5938325524330139, + "step": 5014 + }, + { + "epoch": 2.116455696202532, + "grad_norm": 1.1779940128326416, + "learning_rate": 7.984972448389517e-05, + "loss": 0.5761791467666626, + "step": 5016 + }, + { + "epoch": 2.1172995780590718, + "grad_norm": 1.063490629196167, + "learning_rate": 7.98304554244638e-05, + "loss": 0.6073653101921082, + "step": 5018 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 1.2390391826629639, + "learning_rate": 7.981117948371927e-05, + "loss": 0.6126761436462402, + "step": 5020 + }, + { + "epoch": 2.118987341772152, + "grad_norm": 1.1946247816085815, + "learning_rate": 7.979189666610818e-05, + "loss": 0.614434003829956, + "step": 5022 + }, + { + "epoch": 2.119831223628692, + "grad_norm": 1.1008374691009521, + "learning_rate": 7.977260697607867e-05, + "loss": 0.5947603583335876, + "step": 5024 + }, + { + "epoch": 2.120675105485232, + "grad_norm": 1.14899480342865, + "learning_rate": 7.975331041808054e-05, + "loss": 0.583965539932251, + "step": 5026 + }, + { + "epoch": 2.1215189873417724, + "grad_norm": 1.1627864837646484, + "learning_rate": 7.973400699656512e-05, + "loss": 0.615121603012085, + "step": 5028 + }, + { + "epoch": 2.1223628691983123, + "grad_norm": 1.3622617721557617, + "learning_rate": 7.971469671598532e-05, + "loss": 0.6268601417541504, + "step": 5030 + }, + { + "epoch": 2.1232067510548522, + "grad_norm": 1.1735879182815552, + "learning_rate": 7.96953795807957e-05, + "loss": 0.6021270155906677, + "step": 5032 + }, + { + "epoch": 2.124050632911392, + "grad_norm": 1.3856201171875, + "learning_rate": 7.96760555954523e-05, + "loss": 0.636816680431366, + "step": 5034 + }, + { + "epoch": 2.1248945147679326, + "grad_norm": 1.1410126686096191, + "learning_rate": 7.965672476441282e-05, + "loss": 0.5324423313140869, + "step": 5036 + }, + { + "epoch": 2.1257383966244725, + "grad_norm": 1.446070909500122, + "learning_rate": 7.963738709213651e-05, + "loss": 0.7433624267578125, + "step": 5038 + }, + { + "epoch": 2.1265822784810124, + "grad_norm": 1.3041753768920898, + "learning_rate": 7.961804258308419e-05, + "loss": 0.6359145641326904, + "step": 5040 + }, + { + "epoch": 2.127426160337553, + "grad_norm": 1.2043813467025757, + "learning_rate": 7.959869124171826e-05, + "loss": 0.6164234280586243, + "step": 5042 + }, + { + "epoch": 2.1282700421940928, + "grad_norm": 1.2375630140304565, + "learning_rate": 7.957933307250273e-05, + "loss": 0.6437279582023621, + "step": 5044 + }, + { + "epoch": 2.1291139240506327, + "grad_norm": 1.210644245147705, + "learning_rate": 7.955996807990314e-05, + "loss": 0.585924506187439, + "step": 5046 + }, + { + "epoch": 2.129957805907173, + "grad_norm": 1.2011489868164062, + "learning_rate": 7.954059626838661e-05, + "loss": 0.6081803441047668, + "step": 5048 + }, + { + "epoch": 2.130801687763713, + "grad_norm": 1.0365782976150513, + "learning_rate": 7.952121764242187e-05, + "loss": 0.5609047412872314, + "step": 5050 + }, + { + "epoch": 2.131645569620253, + "grad_norm": 1.7950767278671265, + "learning_rate": 7.950183220647918e-05, + "loss": 0.5612874031066895, + "step": 5052 + }, + { + "epoch": 2.1324894514767934, + "grad_norm": 1.2933409214019775, + "learning_rate": 7.94824399650304e-05, + "loss": 0.6554630994796753, + "step": 5054 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 1.129828929901123, + "learning_rate": 7.946304092254894e-05, + "loss": 0.5623239278793335, + "step": 5056 + }, + { + "epoch": 2.1341772151898732, + "grad_norm": 1.1060296297073364, + "learning_rate": 7.944363508350978e-05, + "loss": 0.5036910772323608, + "step": 5058 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 1.2622627019882202, + "learning_rate": 7.94242224523895e-05, + "loss": 0.5840913653373718, + "step": 5060 + }, + { + "epoch": 2.1358649789029536, + "grad_norm": 1.3803153038024902, + "learning_rate": 7.940480303366618e-05, + "loss": 0.6365578770637512, + "step": 5062 + }, + { + "epoch": 2.1367088607594935, + "grad_norm": 1.2524651288986206, + "learning_rate": 7.938537683181955e-05, + "loss": 0.6167916655540466, + "step": 5064 + }, + { + "epoch": 2.137552742616034, + "grad_norm": 1.3320350646972656, + "learning_rate": 7.936594385133083e-05, + "loss": 0.6356930732727051, + "step": 5066 + }, + { + "epoch": 2.138396624472574, + "grad_norm": 1.3180949687957764, + "learning_rate": 7.934650409668285e-05, + "loss": 0.5888242721557617, + "step": 5068 + }, + { + "epoch": 2.1392405063291138, + "grad_norm": 1.1376243829727173, + "learning_rate": 7.932705757235999e-05, + "loss": 0.608725905418396, + "step": 5070 + }, + { + "epoch": 2.140084388185654, + "grad_norm": 1.1734369993209839, + "learning_rate": 7.930760428284817e-05, + "loss": 0.5824158787727356, + "step": 5072 + }, + { + "epoch": 2.140928270042194, + "grad_norm": 1.1038579940795898, + "learning_rate": 7.928814423263493e-05, + "loss": 0.5629416704177856, + "step": 5074 + }, + { + "epoch": 2.141772151898734, + "grad_norm": 1.269780158996582, + "learning_rate": 7.926867742620929e-05, + "loss": 0.5994445085525513, + "step": 5076 + }, + { + "epoch": 2.1426160337552744, + "grad_norm": 1.2274279594421387, + "learning_rate": 7.924920386806188e-05, + "loss": 0.5845475792884827, + "step": 5078 + }, + { + "epoch": 2.1434599156118144, + "grad_norm": 1.168766975402832, + "learning_rate": 7.922972356268488e-05, + "loss": 0.621201753616333, + "step": 5080 + }, + { + "epoch": 2.1443037974683543, + "grad_norm": 1.0057638883590698, + "learning_rate": 7.921023651457203e-05, + "loss": 0.5282597541809082, + "step": 5082 + }, + { + "epoch": 2.1451476793248947, + "grad_norm": 1.432309865951538, + "learning_rate": 7.91907427282186e-05, + "loss": 0.632583737373352, + "step": 5084 + }, + { + "epoch": 2.1459915611814346, + "grad_norm": 1.3939776420593262, + "learning_rate": 7.917124220812144e-05, + "loss": 0.6239289045333862, + "step": 5086 + }, + { + "epoch": 2.1468354430379746, + "grad_norm": 1.3741775751113892, + "learning_rate": 7.915173495877895e-05, + "loss": 0.5749062895774841, + "step": 5088 + }, + { + "epoch": 2.147679324894515, + "grad_norm": 1.3123528957366943, + "learning_rate": 7.913222098469109e-05, + "loss": 0.6011738181114197, + "step": 5090 + }, + { + "epoch": 2.148523206751055, + "grad_norm": 1.3473498821258545, + "learning_rate": 7.911270029035932e-05, + "loss": 0.5804699659347534, + "step": 5092 + }, + { + "epoch": 2.149367088607595, + "grad_norm": 1.0873067378997803, + "learning_rate": 7.909317288028673e-05, + "loss": 0.6446103453636169, + "step": 5094 + }, + { + "epoch": 2.1502109704641352, + "grad_norm": 1.1374083757400513, + "learning_rate": 7.907363875897789e-05, + "loss": 0.6136524677276611, + "step": 5096 + }, + { + "epoch": 2.151054852320675, + "grad_norm": 1.1356533765792847, + "learning_rate": 7.905409793093896e-05, + "loss": 0.5107976794242859, + "step": 5098 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 1.2579567432403564, + "learning_rate": 7.903455040067763e-05, + "loss": 0.6073099374771118, + "step": 5100 + }, + { + "epoch": 2.151898734177215, + "eval_loss": 0.6902023553848267, + "eval_runtime": 733.915, + "eval_samples_per_second": 2.871, + "eval_steps_per_second": 2.871, + "step": 5100 + }, + { + "epoch": 2.1527426160337555, + "grad_norm": 1.2401398420333862, + "learning_rate": 7.901499617270315e-05, + "loss": 0.5562406182289124, + "step": 5102 + }, + { + "epoch": 2.1535864978902954, + "grad_norm": 1.086590051651001, + "learning_rate": 7.899543525152628e-05, + "loss": 0.5749467015266418, + "step": 5104 + }, + { + "epoch": 2.1544303797468354, + "grad_norm": 1.206458568572998, + "learning_rate": 7.897586764165939e-05, + "loss": 0.6326877474784851, + "step": 5106 + }, + { + "epoch": 2.1552742616033758, + "grad_norm": 1.030740737915039, + "learning_rate": 7.895629334761632e-05, + "loss": 0.5616445541381836, + "step": 5108 + }, + { + "epoch": 2.1561181434599157, + "grad_norm": 1.3338581323623657, + "learning_rate": 7.89367123739125e-05, + "loss": 0.6307384371757507, + "step": 5110 + }, + { + "epoch": 2.1569620253164556, + "grad_norm": 1.2684671878814697, + "learning_rate": 7.891712472506485e-05, + "loss": 0.6087653636932373, + "step": 5112 + }, + { + "epoch": 2.1578059071729956, + "grad_norm": 1.1610581874847412, + "learning_rate": 7.889753040559188e-05, + "loss": 0.5747998952865601, + "step": 5114 + }, + { + "epoch": 2.158649789029536, + "grad_norm": 1.4069275856018066, + "learning_rate": 7.887792942001366e-05, + "loss": 0.6143770217895508, + "step": 5116 + }, + { + "epoch": 2.159493670886076, + "grad_norm": 1.0858227014541626, + "learning_rate": 7.885832177285173e-05, + "loss": 0.552534282207489, + "step": 5118 + }, + { + "epoch": 2.160337552742616, + "grad_norm": 1.067070722579956, + "learning_rate": 7.88387074686292e-05, + "loss": 0.5781989693641663, + "step": 5120 + }, + { + "epoch": 2.1611814345991562, + "grad_norm": 1.139981746673584, + "learning_rate": 7.881908651187072e-05, + "loss": 0.5521422624588013, + "step": 5122 + }, + { + "epoch": 2.162025316455696, + "grad_norm": 1.0987457036972046, + "learning_rate": 7.879945890710245e-05, + "loss": 0.5755025744438171, + "step": 5124 + }, + { + "epoch": 2.162869198312236, + "grad_norm": 1.1530758142471313, + "learning_rate": 7.877982465885214e-05, + "loss": 0.5783509612083435, + "step": 5126 + }, + { + "epoch": 2.1637130801687765, + "grad_norm": 1.2285696268081665, + "learning_rate": 7.876018377164899e-05, + "loss": 0.5942281484603882, + "step": 5128 + }, + { + "epoch": 2.1645569620253164, + "grad_norm": 1.1283711194992065, + "learning_rate": 7.874053625002378e-05, + "loss": 0.5539707541465759, + "step": 5130 + }, + { + "epoch": 2.1654008438818564, + "grad_norm": 1.3213335275650024, + "learning_rate": 7.872088209850885e-05, + "loss": 0.5955292582511902, + "step": 5132 + }, + { + "epoch": 2.1662447257383968, + "grad_norm": 1.1748592853546143, + "learning_rate": 7.8701221321638e-05, + "loss": 0.5422899723052979, + "step": 5134 + }, + { + "epoch": 2.1670886075949367, + "grad_norm": 1.0752148628234863, + "learning_rate": 7.868155392394662e-05, + "loss": 0.5547205209732056, + "step": 5136 + }, + { + "epoch": 2.1679324894514767, + "grad_norm": 1.1814554929733276, + "learning_rate": 7.86618799099716e-05, + "loss": 0.5938948392868042, + "step": 5138 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 1.3455278873443604, + "learning_rate": 7.864219928425132e-05, + "loss": 0.6468925476074219, + "step": 5140 + }, + { + "epoch": 2.169620253164557, + "grad_norm": 1.2695354223251343, + "learning_rate": 7.862251205132576e-05, + "loss": 0.5704391002655029, + "step": 5142 + }, + { + "epoch": 2.170464135021097, + "grad_norm": 1.1529468297958374, + "learning_rate": 7.860281821573638e-05, + "loss": 0.6057283878326416, + "step": 5144 + }, + { + "epoch": 2.1713080168776373, + "grad_norm": 1.3461004495620728, + "learning_rate": 7.858311778202616e-05, + "loss": 0.6135527491569519, + "step": 5146 + }, + { + "epoch": 2.1721518987341772, + "grad_norm": 1.1258536577224731, + "learning_rate": 7.856341075473962e-05, + "loss": 0.5585638880729675, + "step": 5148 + }, + { + "epoch": 2.172995780590717, + "grad_norm": 1.254898190498352, + "learning_rate": 7.854369713842279e-05, + "loss": 0.5780918002128601, + "step": 5150 + }, + { + "epoch": 2.1738396624472576, + "grad_norm": 1.2730201482772827, + "learning_rate": 7.852397693762321e-05, + "loss": 0.595267117023468, + "step": 5152 + }, + { + "epoch": 2.1746835443037975, + "grad_norm": 1.1875078678131104, + "learning_rate": 7.850425015688999e-05, + "loss": 0.5636162161827087, + "step": 5154 + }, + { + "epoch": 2.1755274261603375, + "grad_norm": 1.0930945873260498, + "learning_rate": 7.848451680077366e-05, + "loss": 0.6362089514732361, + "step": 5156 + }, + { + "epoch": 2.176371308016878, + "grad_norm": 1.2274452447891235, + "learning_rate": 7.846477687382639e-05, + "loss": 0.6268675327301025, + "step": 5158 + }, + { + "epoch": 2.1772151898734178, + "grad_norm": 1.2023133039474487, + "learning_rate": 7.844503038060176e-05, + "loss": 0.6014906167984009, + "step": 5160 + }, + { + "epoch": 2.1780590717299577, + "grad_norm": 1.2616889476776123, + "learning_rate": 7.842527732565491e-05, + "loss": 0.6180019974708557, + "step": 5162 + }, + { + "epoch": 2.1789029535864977, + "grad_norm": 1.1046907901763916, + "learning_rate": 7.84055177135425e-05, + "loss": 0.5400100946426392, + "step": 5164 + }, + { + "epoch": 2.179746835443038, + "grad_norm": 1.1664032936096191, + "learning_rate": 7.83857515488227e-05, + "loss": 0.5713199973106384, + "step": 5166 + }, + { + "epoch": 2.180590717299578, + "grad_norm": 1.2526558637619019, + "learning_rate": 7.836597883605519e-05, + "loss": 0.5741307735443115, + "step": 5168 + }, + { + "epoch": 2.181434599156118, + "grad_norm": 1.0457103252410889, + "learning_rate": 7.834619957980112e-05, + "loss": 0.47188031673431396, + "step": 5170 + }, + { + "epoch": 2.1822784810126583, + "grad_norm": 1.1978110074996948, + "learning_rate": 7.832641378462319e-05, + "loss": 0.6149471998214722, + "step": 5172 + }, + { + "epoch": 2.1831223628691983, + "grad_norm": 1.2231460809707642, + "learning_rate": 7.830662145508567e-05, + "loss": 0.5520018339157104, + "step": 5174 + }, + { + "epoch": 2.183966244725738, + "grad_norm": 1.4367618560791016, + "learning_rate": 7.828682259575417e-05, + "loss": 0.6536548733711243, + "step": 5176 + }, + { + "epoch": 2.1848101265822786, + "grad_norm": 1.0891374349594116, + "learning_rate": 7.826701721119598e-05, + "loss": 0.5324372053146362, + "step": 5178 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 1.118695616722107, + "learning_rate": 7.82472053059798e-05, + "loss": 0.6127952337265015, + "step": 5180 + }, + { + "epoch": 2.1864978902953585, + "grad_norm": 1.1116070747375488, + "learning_rate": 7.822738688467585e-05, + "loss": 0.505962610244751, + "step": 5182 + }, + { + "epoch": 2.187341772151899, + "grad_norm": 1.2140545845031738, + "learning_rate": 7.820756195185586e-05, + "loss": 0.6210073232650757, + "step": 5184 + }, + { + "epoch": 2.188185654008439, + "grad_norm": 1.2135601043701172, + "learning_rate": 7.818773051209307e-05, + "loss": 0.6517674326896667, + "step": 5186 + }, + { + "epoch": 2.1890295358649787, + "grad_norm": 1.3875514268875122, + "learning_rate": 7.816789256996218e-05, + "loss": 0.5577492117881775, + "step": 5188 + }, + { + "epoch": 2.189873417721519, + "grad_norm": 1.181325912475586, + "learning_rate": 7.814804813003949e-05, + "loss": 0.6010199189186096, + "step": 5190 + }, + { + "epoch": 2.190717299578059, + "grad_norm": 1.102044701576233, + "learning_rate": 7.812819719690265e-05, + "loss": 0.5635302662849426, + "step": 5192 + }, + { + "epoch": 2.191561181434599, + "grad_norm": 1.4227958917617798, + "learning_rate": 7.810833977513094e-05, + "loss": 0.5804321765899658, + "step": 5194 + }, + { + "epoch": 2.1924050632911394, + "grad_norm": 1.2573446035385132, + "learning_rate": 7.80884758693051e-05, + "loss": 0.6005555987358093, + "step": 5196 + }, + { + "epoch": 2.1932489451476793, + "grad_norm": 1.3534085750579834, + "learning_rate": 7.80686054840073e-05, + "loss": 0.6263643503189087, + "step": 5198 + }, + { + "epoch": 2.1940928270042193, + "grad_norm": 1.6895852088928223, + "learning_rate": 7.804872862382131e-05, + "loss": 0.6235764622688293, + "step": 5200 + }, + { + "epoch": 2.1940928270042193, + "eval_loss": 0.6915348172187805, + "eval_runtime": 1167.9782, + "eval_samples_per_second": 1.804, + "eval_steps_per_second": 1.804, + "step": 5200 + }, + { + "epoch": 2.1949367088607596, + "grad_norm": 1.138973593711853, + "learning_rate": 7.802884529333227e-05, + "loss": 0.5586035847663879, + "step": 5202 + }, + { + "epoch": 2.1957805907172996, + "grad_norm": 1.3664026260375977, + "learning_rate": 7.800895549712697e-05, + "loss": 0.5768917202949524, + "step": 5204 + }, + { + "epoch": 2.1966244725738395, + "grad_norm": 1.2182449102401733, + "learning_rate": 7.798905923979353e-05, + "loss": 0.6046215891838074, + "step": 5206 + }, + { + "epoch": 2.19746835443038, + "grad_norm": 1.2692211866378784, + "learning_rate": 7.796915652592167e-05, + "loss": 0.5412904024124146, + "step": 5208 + }, + { + "epoch": 2.19831223628692, + "grad_norm": 1.200822114944458, + "learning_rate": 7.794924736010256e-05, + "loss": 0.5328584909439087, + "step": 5210 + }, + { + "epoch": 2.19915611814346, + "grad_norm": 1.1093779802322388, + "learning_rate": 7.792933174692886e-05, + "loss": 0.5497913360595703, + "step": 5212 + }, + { + "epoch": 2.2, + "grad_norm": 1.3838921785354614, + "learning_rate": 7.790940969099471e-05, + "loss": 0.5908066034317017, + "step": 5214 + }, + { + "epoch": 2.20084388185654, + "grad_norm": 1.1411913633346558, + "learning_rate": 7.788948119689576e-05, + "loss": 0.6117307543754578, + "step": 5216 + }, + { + "epoch": 2.20168776371308, + "grad_norm": 1.5668916702270508, + "learning_rate": 7.786954626922913e-05, + "loss": 0.5788605809211731, + "step": 5218 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 1.195027232170105, + "learning_rate": 7.784960491259344e-05, + "loss": 0.5948591828346252, + "step": 5220 + }, + { + "epoch": 2.2033755274261604, + "grad_norm": 1.2665271759033203, + "learning_rate": 7.782965713158872e-05, + "loss": 0.6321669220924377, + "step": 5222 + }, + { + "epoch": 2.2042194092827003, + "grad_norm": 1.123711109161377, + "learning_rate": 7.78097029308166e-05, + "loss": 0.5853859186172485, + "step": 5224 + }, + { + "epoch": 2.2050632911392407, + "grad_norm": 1.9381071329116821, + "learning_rate": 7.77897423148801e-05, + "loss": 0.6485977172851562, + "step": 5226 + }, + { + "epoch": 2.2059071729957807, + "grad_norm": 1.4062265157699585, + "learning_rate": 7.776977528838376e-05, + "loss": 0.6243517398834229, + "step": 5228 + }, + { + "epoch": 2.2067510548523206, + "grad_norm": 1.2127182483673096, + "learning_rate": 7.774980185593358e-05, + "loss": 0.5770578980445862, + "step": 5230 + }, + { + "epoch": 2.207594936708861, + "grad_norm": 1.250847578048706, + "learning_rate": 7.772982202213709e-05, + "loss": 0.6521194577217102, + "step": 5232 + }, + { + "epoch": 2.208438818565401, + "grad_norm": 1.2568131685256958, + "learning_rate": 7.77098357916032e-05, + "loss": 0.5755271911621094, + "step": 5234 + }, + { + "epoch": 2.209282700421941, + "grad_norm": 1.2422975301742554, + "learning_rate": 7.768984316894236e-05, + "loss": 0.5486469864845276, + "step": 5236 + }, + { + "epoch": 2.2101265822784812, + "grad_norm": 1.1018635034561157, + "learning_rate": 7.766984415876652e-05, + "loss": 0.5512928366661072, + "step": 5238 + }, + { + "epoch": 2.210970464135021, + "grad_norm": 1.2261123657226562, + "learning_rate": 7.764983876568903e-05, + "loss": 0.5753499269485474, + "step": 5240 + }, + { + "epoch": 2.211814345991561, + "grad_norm": 1.2222342491149902, + "learning_rate": 7.762982699432474e-05, + "loss": 0.5404848456382751, + "step": 5242 + }, + { + "epoch": 2.212658227848101, + "grad_norm": 1.231494426727295, + "learning_rate": 7.760980884929004e-05, + "loss": 0.5999218821525574, + "step": 5244 + }, + { + "epoch": 2.2135021097046415, + "grad_norm": 1.1530078649520874, + "learning_rate": 7.758978433520268e-05, + "loss": 0.6123101115226746, + "step": 5246 + }, + { + "epoch": 2.2143459915611814, + "grad_norm": 1.182706594467163, + "learning_rate": 7.756975345668194e-05, + "loss": 0.5945886969566345, + "step": 5248 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 1.0788652896881104, + "learning_rate": 7.754971621834857e-05, + "loss": 0.5698213577270508, + "step": 5250 + }, + { + "epoch": 2.2160337552742617, + "grad_norm": 1.2243359088897705, + "learning_rate": 7.752967262482477e-05, + "loss": 0.5959678888320923, + "step": 5252 + }, + { + "epoch": 2.2168776371308017, + "grad_norm": 1.4292869567871094, + "learning_rate": 7.750962268073421e-05, + "loss": 0.586794376373291, + "step": 5254 + }, + { + "epoch": 2.2177215189873416, + "grad_norm": 1.1809570789337158, + "learning_rate": 7.748956639070204e-05, + "loss": 0.5513298511505127, + "step": 5256 + }, + { + "epoch": 2.218565400843882, + "grad_norm": 1.485813856124878, + "learning_rate": 7.746950375935484e-05, + "loss": 0.6402831673622131, + "step": 5258 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 1.0851374864578247, + "learning_rate": 7.744943479132069e-05, + "loss": 0.5729117393493652, + "step": 5260 + }, + { + "epoch": 2.220253164556962, + "grad_norm": 1.4308949708938599, + "learning_rate": 7.742935949122911e-05, + "loss": 0.6239725947380066, + "step": 5262 + }, + { + "epoch": 2.2210970464135023, + "grad_norm": 1.379258155822754, + "learning_rate": 7.740927786371107e-05, + "loss": 0.6260181069374084, + "step": 5264 + }, + { + "epoch": 2.221940928270042, + "grad_norm": 1.1661925315856934, + "learning_rate": 7.738918991339905e-05, + "loss": 0.6074157357215881, + "step": 5266 + }, + { + "epoch": 2.222784810126582, + "grad_norm": 1.168901801109314, + "learning_rate": 7.736909564492694e-05, + "loss": 0.6119515895843506, + "step": 5268 + }, + { + "epoch": 2.2236286919831225, + "grad_norm": 1.1451057195663452, + "learning_rate": 7.734899506293008e-05, + "loss": 0.5505842566490173, + "step": 5270 + }, + { + "epoch": 2.2244725738396625, + "grad_norm": 1.2303991317749023, + "learning_rate": 7.732888817204533e-05, + "loss": 0.6117991805076599, + "step": 5272 + }, + { + "epoch": 2.2253164556962024, + "grad_norm": 1.04572331905365, + "learning_rate": 7.730877497691092e-05, + "loss": 0.5589770078659058, + "step": 5274 + }, + { + "epoch": 2.226160337552743, + "grad_norm": 1.2047234773635864, + "learning_rate": 7.72886554821666e-05, + "loss": 0.6288654208183289, + "step": 5276 + }, + { + "epoch": 2.2270042194092827, + "grad_norm": 1.2036652565002441, + "learning_rate": 7.726852969245355e-05, + "loss": 0.6174501776695251, + "step": 5278 + }, + { + "epoch": 2.2278481012658227, + "grad_norm": 1.1740167140960693, + "learning_rate": 7.72483976124144e-05, + "loss": 0.6027677655220032, + "step": 5280 + }, + { + "epoch": 2.228691983122363, + "grad_norm": 1.0600008964538574, + "learning_rate": 7.722825924669326e-05, + "loss": 0.6016151309013367, + "step": 5282 + }, + { + "epoch": 2.229535864978903, + "grad_norm": 1.2631008625030518, + "learning_rate": 7.720811459993562e-05, + "loss": 0.5905849933624268, + "step": 5284 + }, + { + "epoch": 2.230379746835443, + "grad_norm": 1.1024738550186157, + "learning_rate": 7.718796367678848e-05, + "loss": 0.5129587054252625, + "step": 5286 + }, + { + "epoch": 2.2312236286919833, + "grad_norm": 1.23116934299469, + "learning_rate": 7.716780648190028e-05, + "loss": 0.5709586143493652, + "step": 5288 + }, + { + "epoch": 2.2320675105485233, + "grad_norm": 1.2739102840423584, + "learning_rate": 7.714764301992088e-05, + "loss": 0.5454761385917664, + "step": 5290 + }, + { + "epoch": 2.232911392405063, + "grad_norm": 1.303963303565979, + "learning_rate": 7.712747329550162e-05, + "loss": 0.537248969078064, + "step": 5292 + }, + { + "epoch": 2.233755274261603, + "grad_norm": 1.2454309463500977, + "learning_rate": 7.710729731329529e-05, + "loss": 0.6364415884017944, + "step": 5294 + }, + { + "epoch": 2.2345991561181435, + "grad_norm": 1.2401882410049438, + "learning_rate": 7.708711507795605e-05, + "loss": 0.5640100240707397, + "step": 5296 + }, + { + "epoch": 2.2354430379746835, + "grad_norm": 1.197432041168213, + "learning_rate": 7.706692659413959e-05, + "loss": 0.5919729471206665, + "step": 5298 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 1.1779764890670776, + "learning_rate": 7.704673186650298e-05, + "loss": 0.5569849014282227, + "step": 5300 + }, + { + "epoch": 2.2362869198312234, + "eval_loss": 0.6898328065872192, + "eval_runtime": 739.3794, + "eval_samples_per_second": 2.85, + "eval_steps_per_second": 2.85, + "step": 5300 + }, + { + "epoch": 2.237130801687764, + "grad_norm": 1.1371463537216187, + "learning_rate": 7.702653089970479e-05, + "loss": 0.5823061466217041, + "step": 5302 + }, + { + "epoch": 2.2379746835443037, + "grad_norm": 1.1877846717834473, + "learning_rate": 7.700632369840497e-05, + "loss": 0.5556252002716064, + "step": 5304 + }, + { + "epoch": 2.2388185654008437, + "grad_norm": 1.1580896377563477, + "learning_rate": 7.698611026726492e-05, + "loss": 0.5794119834899902, + "step": 5306 + }, + { + "epoch": 2.239662447257384, + "grad_norm": 1.29141366481781, + "learning_rate": 7.696589061094755e-05, + "loss": 0.5828680396080017, + "step": 5308 + }, + { + "epoch": 2.240506329113924, + "grad_norm": 1.1286728382110596, + "learning_rate": 7.694566473411706e-05, + "loss": 0.6161736845970154, + "step": 5310 + }, + { + "epoch": 2.241350210970464, + "grad_norm": 1.0969985723495483, + "learning_rate": 7.692543264143925e-05, + "loss": 0.570767879486084, + "step": 5312 + }, + { + "epoch": 2.2421940928270043, + "grad_norm": 1.2902227640151978, + "learning_rate": 7.690519433758123e-05, + "loss": 0.631476104259491, + "step": 5314 + }, + { + "epoch": 2.2430379746835443, + "grad_norm": 1.432735800743103, + "learning_rate": 7.68849498272116e-05, + "loss": 0.6142309904098511, + "step": 5316 + }, + { + "epoch": 2.243881856540084, + "grad_norm": 1.0824161767959595, + "learning_rate": 7.686469911500038e-05, + "loss": 0.5871514081954956, + "step": 5318 + }, + { + "epoch": 2.2447257383966246, + "grad_norm": 1.1694978475570679, + "learning_rate": 7.684444220561902e-05, + "loss": 0.6144557595252991, + "step": 5320 + }, + { + "epoch": 2.2455696202531645, + "grad_norm": 1.2981040477752686, + "learning_rate": 7.68241791037404e-05, + "loss": 0.6049425601959229, + "step": 5322 + }, + { + "epoch": 2.2464135021097045, + "grad_norm": 1.132128357887268, + "learning_rate": 7.680390981403885e-05, + "loss": 0.5571867823600769, + "step": 5324 + }, + { + "epoch": 2.247257383966245, + "grad_norm": 1.1760079860687256, + "learning_rate": 7.678363434119005e-05, + "loss": 0.5710517168045044, + "step": 5326 + }, + { + "epoch": 2.248101265822785, + "grad_norm": 1.1918572187423706, + "learning_rate": 7.67633526898712e-05, + "loss": 0.5508866906166077, + "step": 5328 + }, + { + "epoch": 2.2489451476793247, + "grad_norm": 1.1837294101715088, + "learning_rate": 7.674306486476091e-05, + "loss": 0.6242696046829224, + "step": 5330 + }, + { + "epoch": 2.249789029535865, + "grad_norm": 1.384918212890625, + "learning_rate": 7.672277087053914e-05, + "loss": 0.5821678042411804, + "step": 5332 + }, + { + "epoch": 2.250632911392405, + "grad_norm": 1.1248877048492432, + "learning_rate": 7.670247071188738e-05, + "loss": 0.5415928363800049, + "step": 5334 + }, + { + "epoch": 2.251476793248945, + "grad_norm": 1.228140950202942, + "learning_rate": 7.668216439348843e-05, + "loss": 0.5475174188613892, + "step": 5336 + }, + { + "epoch": 2.2523206751054854, + "grad_norm": 1.3816046714782715, + "learning_rate": 7.666185192002662e-05, + "loss": 0.5793306231498718, + "step": 5338 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 1.2446565628051758, + "learning_rate": 7.664153329618759e-05, + "loss": 0.6221131682395935, + "step": 5340 + }, + { + "epoch": 2.2540084388185653, + "grad_norm": 1.1677669286727905, + "learning_rate": 7.662120852665852e-05, + "loss": 0.5403847694396973, + "step": 5342 + }, + { + "epoch": 2.2548523206751057, + "grad_norm": 1.2485873699188232, + "learning_rate": 7.66008776161279e-05, + "loss": 0.620201587677002, + "step": 5344 + }, + { + "epoch": 2.2556962025316456, + "grad_norm": 1.2486802339553833, + "learning_rate": 7.658054056928568e-05, + "loss": 0.5969216227531433, + "step": 5346 + }, + { + "epoch": 2.2565400843881855, + "grad_norm": 1.2621372938156128, + "learning_rate": 7.656019739082326e-05, + "loss": 0.6376339793205261, + "step": 5348 + }, + { + "epoch": 2.257383966244726, + "grad_norm": 1.238633155822754, + "learning_rate": 7.65398480854334e-05, + "loss": 0.6374872326850891, + "step": 5350 + }, + { + "epoch": 2.258227848101266, + "grad_norm": 1.3031803369522095, + "learning_rate": 7.651949265781029e-05, + "loss": 0.6348551511764526, + "step": 5352 + }, + { + "epoch": 2.259071729957806, + "grad_norm": 1.3735158443450928, + "learning_rate": 7.649913111264952e-05, + "loss": 0.6267750859260559, + "step": 5354 + }, + { + "epoch": 2.259915611814346, + "grad_norm": 1.1227772235870361, + "learning_rate": 7.647876345464817e-05, + "loss": 0.623030960559845, + "step": 5356 + }, + { + "epoch": 2.260759493670886, + "grad_norm": 1.4555678367614746, + "learning_rate": 7.645838968850459e-05, + "loss": 0.5810713171958923, + "step": 5358 + }, + { + "epoch": 2.261603375527426, + "grad_norm": 1.227725863456726, + "learning_rate": 7.643800981891867e-05, + "loss": 0.6150093078613281, + "step": 5360 + }, + { + "epoch": 2.2624472573839665, + "grad_norm": 1.0648300647735596, + "learning_rate": 7.641762385059161e-05, + "loss": 0.5350445508956909, + "step": 5362 + }, + { + "epoch": 2.2632911392405064, + "grad_norm": 1.179452896118164, + "learning_rate": 7.639723178822613e-05, + "loss": 0.6253421306610107, + "step": 5364 + }, + { + "epoch": 2.2641350210970463, + "grad_norm": 1.0983240604400635, + "learning_rate": 7.637683363652621e-05, + "loss": 0.5512562990188599, + "step": 5366 + }, + { + "epoch": 2.2649789029535867, + "grad_norm": 1.1825451850891113, + "learning_rate": 7.635642940019736e-05, + "loss": 0.5584151148796082, + "step": 5368 + }, + { + "epoch": 2.2658227848101267, + "grad_norm": 1.1022000312805176, + "learning_rate": 7.633601908394643e-05, + "loss": 0.5881790518760681, + "step": 5370 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.1935697793960571, + "learning_rate": 7.631560269248169e-05, + "loss": 0.6060683131217957, + "step": 5372 + }, + { + "epoch": 2.267510548523207, + "grad_norm": 1.1174103021621704, + "learning_rate": 7.62951802305128e-05, + "loss": 0.5877062678337097, + "step": 5374 + }, + { + "epoch": 2.268354430379747, + "grad_norm": 1.3934977054595947, + "learning_rate": 7.627475170275086e-05, + "loss": 0.5145504474639893, + "step": 5376 + }, + { + "epoch": 2.269198312236287, + "grad_norm": 1.2637842893600464, + "learning_rate": 7.625431711390831e-05, + "loss": 0.6194025874137878, + "step": 5378 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 1.2034388780593872, + "learning_rate": 7.623387646869902e-05, + "loss": 0.6205627918243408, + "step": 5380 + }, + { + "epoch": 2.270886075949367, + "grad_norm": 0.953880250453949, + "learning_rate": 7.621342977183826e-05, + "loss": 0.5609696507453918, + "step": 5382 + }, + { + "epoch": 2.271729957805907, + "grad_norm": 1.2841949462890625, + "learning_rate": 7.619297702804272e-05, + "loss": 0.6044906377792358, + "step": 5384 + }, + { + "epoch": 2.272573839662447, + "grad_norm": 1.146804690361023, + "learning_rate": 7.617251824203037e-05, + "loss": 0.5420435667037964, + "step": 5386 + }, + { + "epoch": 2.2734177215189875, + "grad_norm": 1.2225698232650757, + "learning_rate": 7.615205341852076e-05, + "loss": 0.6230710744857788, + "step": 5388 + }, + { + "epoch": 2.2742616033755274, + "grad_norm": 1.3423371315002441, + "learning_rate": 7.613158256223467e-05, + "loss": 0.6486349701881409, + "step": 5390 + }, + { + "epoch": 2.2751054852320673, + "grad_norm": 1.0840023756027222, + "learning_rate": 7.611110567789435e-05, + "loss": 0.6527825593948364, + "step": 5392 + }, + { + "epoch": 2.2759493670886077, + "grad_norm": 1.342466950416565, + "learning_rate": 7.609062277022341e-05, + "loss": 0.6859483122825623, + "step": 5394 + }, + { + "epoch": 2.2767932489451477, + "grad_norm": 1.0406129360198975, + "learning_rate": 7.607013384394691e-05, + "loss": 0.5536003708839417, + "step": 5396 + }, + { + "epoch": 2.2776371308016876, + "grad_norm": 1.0853544473648071, + "learning_rate": 7.604963890379118e-05, + "loss": 0.5488654971122742, + "step": 5398 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 1.0330145359039307, + "learning_rate": 7.602913795448407e-05, + "loss": 0.6072142720222473, + "step": 5400 + }, + { + "epoch": 2.278481012658228, + "eval_loss": 0.6875645518302917, + "eval_runtime": 861.3558, + "eval_samples_per_second": 2.446, + "eval_steps_per_second": 2.446, + "step": 5400 + }, + { + "epoch": 2.279324894514768, + "grad_norm": 1.1858742237091064, + "learning_rate": 7.600863100075472e-05, + "loss": 0.5420109033584595, + "step": 5402 + }, + { + "epoch": 2.280168776371308, + "grad_norm": 1.2126039266586304, + "learning_rate": 7.598811804733373e-05, + "loss": 0.6109243631362915, + "step": 5404 + }, + { + "epoch": 2.2810126582278483, + "grad_norm": 1.1290241479873657, + "learning_rate": 7.5967599098953e-05, + "loss": 0.5889696478843689, + "step": 5406 + }, + { + "epoch": 2.281856540084388, + "grad_norm": 1.320263147354126, + "learning_rate": 7.594707416034586e-05, + "loss": 0.6548630595207214, + "step": 5408 + }, + { + "epoch": 2.282700421940928, + "grad_norm": 1.346169114112854, + "learning_rate": 7.592654323624703e-05, + "loss": 0.6556787490844727, + "step": 5410 + }, + { + "epoch": 2.2835443037974685, + "grad_norm": 1.2104716300964355, + "learning_rate": 7.590600633139265e-05, + "loss": 0.5631673336029053, + "step": 5412 + }, + { + "epoch": 2.2843881856540085, + "grad_norm": 1.3298237323760986, + "learning_rate": 7.58854634505201e-05, + "loss": 0.5931088328361511, + "step": 5414 + }, + { + "epoch": 2.2852320675105484, + "grad_norm": 1.4201204776763916, + "learning_rate": 7.586491459836829e-05, + "loss": 0.6966755986213684, + "step": 5416 + }, + { + "epoch": 2.286075949367089, + "grad_norm": 1.253135323524475, + "learning_rate": 7.584435977967743e-05, + "loss": 0.6172569394111633, + "step": 5418 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 1.133144736289978, + "learning_rate": 7.582379899918911e-05, + "loss": 0.5376655459403992, + "step": 5420 + }, + { + "epoch": 2.2877637130801687, + "grad_norm": 1.1103745698928833, + "learning_rate": 7.580323226164632e-05, + "loss": 0.6138498187065125, + "step": 5422 + }, + { + "epoch": 2.2886075949367086, + "grad_norm": 1.091636300086975, + "learning_rate": 7.57826595717934e-05, + "loss": 0.5049096345901489, + "step": 5424 + }, + { + "epoch": 2.289451476793249, + "grad_norm": 1.2486571073532104, + "learning_rate": 7.57620809343761e-05, + "loss": 0.5666115283966064, + "step": 5426 + }, + { + "epoch": 2.290295358649789, + "grad_norm": 1.510684847831726, + "learning_rate": 7.57414963541415e-05, + "loss": 0.49512919783592224, + "step": 5428 + }, + { + "epoch": 2.291139240506329, + "grad_norm": 1.1142191886901855, + "learning_rate": 7.572090583583805e-05, + "loss": 0.558807373046875, + "step": 5430 + }, + { + "epoch": 2.2919831223628693, + "grad_norm": 1.1162657737731934, + "learning_rate": 7.57003093842156e-05, + "loss": 0.6245265603065491, + "step": 5432 + }, + { + "epoch": 2.292827004219409, + "grad_norm": 1.2784614562988281, + "learning_rate": 7.567970700402537e-05, + "loss": 0.5505527853965759, + "step": 5434 + }, + { + "epoch": 2.293670886075949, + "grad_norm": 1.3142638206481934, + "learning_rate": 7.565909870001992e-05, + "loss": 0.6137702465057373, + "step": 5436 + }, + { + "epoch": 2.2945147679324895, + "grad_norm": 1.072805404663086, + "learning_rate": 7.563848447695318e-05, + "loss": 0.540766716003418, + "step": 5438 + }, + { + "epoch": 2.2953586497890295, + "grad_norm": 1.2861377000808716, + "learning_rate": 7.561786433958048e-05, + "loss": 0.6806555986404419, + "step": 5440 + }, + { + "epoch": 2.2962025316455694, + "grad_norm": 1.3193045854568481, + "learning_rate": 7.559723829265847e-05, + "loss": 0.6191258430480957, + "step": 5442 + }, + { + "epoch": 2.29704641350211, + "grad_norm": 1.1969127655029297, + "learning_rate": 7.55766063409452e-05, + "loss": 0.6067718863487244, + "step": 5444 + }, + { + "epoch": 2.2978902953586497, + "grad_norm": 1.2129666805267334, + "learning_rate": 7.555596848920006e-05, + "loss": 0.5673627257347107, + "step": 5446 + }, + { + "epoch": 2.2987341772151897, + "grad_norm": 1.1639961004257202, + "learning_rate": 7.553532474218379e-05, + "loss": 0.61825031042099, + "step": 5448 + }, + { + "epoch": 2.29957805907173, + "grad_norm": 1.3893283605575562, + "learning_rate": 7.551467510465852e-05, + "loss": 0.6096790432929993, + "step": 5450 + }, + { + "epoch": 2.30042194092827, + "grad_norm": 1.0708417892456055, + "learning_rate": 7.549401958138772e-05, + "loss": 0.6121414303779602, + "step": 5452 + }, + { + "epoch": 2.30126582278481, + "grad_norm": 1.3299298286437988, + "learning_rate": 7.547335817713624e-05, + "loss": 0.6504668593406677, + "step": 5454 + }, + { + "epoch": 2.3021097046413503, + "grad_norm": 1.3594682216644287, + "learning_rate": 7.545269089667022e-05, + "loss": 0.5761144161224365, + "step": 5456 + }, + { + "epoch": 2.3029535864978903, + "grad_norm": 1.1089586019515991, + "learning_rate": 7.543201774475726e-05, + "loss": 0.5457773804664612, + "step": 5458 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 1.3472918272018433, + "learning_rate": 7.541133872616624e-05, + "loss": 0.6014775037765503, + "step": 5460 + }, + { + "epoch": 2.3046413502109706, + "grad_norm": 1.2757689952850342, + "learning_rate": 7.53906538456674e-05, + "loss": 0.6246467232704163, + "step": 5462 + }, + { + "epoch": 2.3054852320675105, + "grad_norm": 1.4598166942596436, + "learning_rate": 7.536996310803236e-05, + "loss": 0.6583935022354126, + "step": 5464 + }, + { + "epoch": 2.3063291139240505, + "grad_norm": 1.2861602306365967, + "learning_rate": 7.534926651803407e-05, + "loss": 0.562523603439331, + "step": 5466 + }, + { + "epoch": 2.307172995780591, + "grad_norm": 1.0953221321105957, + "learning_rate": 7.532856408044684e-05, + "loss": 0.6093505620956421, + "step": 5468 + }, + { + "epoch": 2.308016877637131, + "grad_norm": 1.0982829332351685, + "learning_rate": 7.530785580004631e-05, + "loss": 0.6196447014808655, + "step": 5470 + }, + { + "epoch": 2.3088607594936708, + "grad_norm": 1.2224280834197998, + "learning_rate": 7.52871416816095e-05, + "loss": 0.6360989212989807, + "step": 5472 + }, + { + "epoch": 2.309704641350211, + "grad_norm": 1.244486927986145, + "learning_rate": 7.526642172991476e-05, + "loss": 0.6189543008804321, + "step": 5474 + }, + { + "epoch": 2.310548523206751, + "grad_norm": 1.2408053874969482, + "learning_rate": 7.524569594974178e-05, + "loss": 0.6137582659721375, + "step": 5476 + }, + { + "epoch": 2.311392405063291, + "grad_norm": 1.3323272466659546, + "learning_rate": 7.522496434587157e-05, + "loss": 0.6462169289588928, + "step": 5478 + }, + { + "epoch": 2.3122362869198314, + "grad_norm": 1.1076425313949585, + "learning_rate": 7.520422692308657e-05, + "loss": 0.5495362877845764, + "step": 5480 + }, + { + "epoch": 2.3130801687763713, + "grad_norm": 1.3298509120941162, + "learning_rate": 7.518348368617046e-05, + "loss": 0.5560636520385742, + "step": 5482 + }, + { + "epoch": 2.3139240506329113, + "grad_norm": 1.0740195512771606, + "learning_rate": 7.516273463990832e-05, + "loss": 0.5763371586799622, + "step": 5484 + }, + { + "epoch": 2.3147679324894517, + "grad_norm": 1.0748567581176758, + "learning_rate": 7.514197978908657e-05, + "loss": 0.5111498832702637, + "step": 5486 + }, + { + "epoch": 2.3156118143459916, + "grad_norm": 1.2047218084335327, + "learning_rate": 7.512121913849294e-05, + "loss": 0.6599951982498169, + "step": 5488 + }, + { + "epoch": 2.3164556962025316, + "grad_norm": 1.2956700325012207, + "learning_rate": 7.510045269291651e-05, + "loss": 0.6409770846366882, + "step": 5490 + }, + { + "epoch": 2.317299578059072, + "grad_norm": 1.241860032081604, + "learning_rate": 7.50796804571477e-05, + "loss": 0.5967662334442139, + "step": 5492 + }, + { + "epoch": 2.318143459915612, + "grad_norm": 1.1612682342529297, + "learning_rate": 7.50589024359783e-05, + "loss": 0.5856342315673828, + "step": 5494 + }, + { + "epoch": 2.318987341772152, + "grad_norm": 1.0895500183105469, + "learning_rate": 7.503811863420135e-05, + "loss": 0.5652023553848267, + "step": 5496 + }, + { + "epoch": 2.319831223628692, + "grad_norm": 1.3374481201171875, + "learning_rate": 7.50173290566113e-05, + "loss": 0.6777268648147583, + "step": 5498 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 1.192614197731018, + "learning_rate": 7.499653370800391e-05, + "loss": 0.6052314043045044, + "step": 5500 + }, + { + "epoch": 2.320675105485232, + "eval_loss": 0.6867148876190186, + "eval_runtime": 941.3545, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5500 + }, + { + "epoch": 2.321518987341772, + "grad_norm": 1.1008832454681396, + "learning_rate": 7.497573259317625e-05, + "loss": 0.5208253860473633, + "step": 5502 + }, + { + "epoch": 2.3223628691983125, + "grad_norm": 1.2141541242599487, + "learning_rate": 7.495492571692677e-05, + "loss": 0.6352296471595764, + "step": 5504 + }, + { + "epoch": 2.3232067510548524, + "grad_norm": 1.2588802576065063, + "learning_rate": 7.493411308405517e-05, + "loss": 0.6132256388664246, + "step": 5506 + }, + { + "epoch": 2.3240506329113924, + "grad_norm": 1.348765254020691, + "learning_rate": 7.491329469936258e-05, + "loss": 0.571265697479248, + "step": 5508 + }, + { + "epoch": 2.3248945147679323, + "grad_norm": 1.266377329826355, + "learning_rate": 7.489247056765135e-05, + "loss": 0.5433708429336548, + "step": 5510 + }, + { + "epoch": 2.3257383966244727, + "grad_norm": 1.2920128107070923, + "learning_rate": 7.487164069372523e-05, + "loss": 0.6193158030509949, + "step": 5512 + }, + { + "epoch": 2.3265822784810126, + "grad_norm": 1.068169116973877, + "learning_rate": 7.485080508238928e-05, + "loss": 0.5817977786064148, + "step": 5514 + }, + { + "epoch": 2.3274261603375526, + "grad_norm": 1.2941710948944092, + "learning_rate": 7.482996373844985e-05, + "loss": 0.6558082103729248, + "step": 5516 + }, + { + "epoch": 2.328270042194093, + "grad_norm": 1.2143336534500122, + "learning_rate": 7.480911666671467e-05, + "loss": 0.5569961667060852, + "step": 5518 + }, + { + "epoch": 2.329113924050633, + "grad_norm": 1.3364789485931396, + "learning_rate": 7.478826387199274e-05, + "loss": 0.6497300863265991, + "step": 5520 + }, + { + "epoch": 2.329957805907173, + "grad_norm": 1.057530403137207, + "learning_rate": 7.47674053590944e-05, + "loss": 0.5793087482452393, + "step": 5522 + }, + { + "epoch": 2.330801687763713, + "grad_norm": 1.1543176174163818, + "learning_rate": 7.47465411328313e-05, + "loss": 0.5583140850067139, + "step": 5524 + }, + { + "epoch": 2.331645569620253, + "grad_norm": 1.3409180641174316, + "learning_rate": 7.472567119801645e-05, + "loss": 0.6318784952163696, + "step": 5526 + }, + { + "epoch": 2.332489451476793, + "grad_norm": 1.2899413108825684, + "learning_rate": 7.47047955594641e-05, + "loss": 0.5950855612754822, + "step": 5528 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.329220175743103, + "learning_rate": 7.468391422198989e-05, + "loss": 0.6181023716926575, + "step": 5530 + }, + { + "epoch": 2.3341772151898734, + "grad_norm": 1.202129602432251, + "learning_rate": 7.466302719041073e-05, + "loss": 0.6384578943252563, + "step": 5532 + }, + { + "epoch": 2.3350210970464134, + "grad_norm": 1.1890549659729004, + "learning_rate": 7.464213446954487e-05, + "loss": 0.6059293746948242, + "step": 5534 + }, + { + "epoch": 2.3358649789029537, + "grad_norm": 1.2041429281234741, + "learning_rate": 7.462123606421183e-05, + "loss": 0.6432797908782959, + "step": 5536 + }, + { + "epoch": 2.3367088607594937, + "grad_norm": 1.3827080726623535, + "learning_rate": 7.460033197923249e-05, + "loss": 0.6796717047691345, + "step": 5538 + }, + { + "epoch": 2.3375527426160336, + "grad_norm": 1.2323482036590576, + "learning_rate": 7.457942221942903e-05, + "loss": 0.5772476196289062, + "step": 5540 + }, + { + "epoch": 2.338396624472574, + "grad_norm": 1.2011388540267944, + "learning_rate": 7.455850678962493e-05, + "loss": 0.5964269042015076, + "step": 5542 + }, + { + "epoch": 2.339240506329114, + "grad_norm": 1.1133569478988647, + "learning_rate": 7.453758569464495e-05, + "loss": 0.6416608095169067, + "step": 5544 + }, + { + "epoch": 2.340084388185654, + "grad_norm": 1.1257679462432861, + "learning_rate": 7.451665893931521e-05, + "loss": 0.5668829679489136, + "step": 5546 + }, + { + "epoch": 2.3409282700421943, + "grad_norm": 1.3494724035263062, + "learning_rate": 7.449572652846311e-05, + "loss": 0.6029916405677795, + "step": 5548 + }, + { + "epoch": 2.3417721518987342, + "grad_norm": 1.2199759483337402, + "learning_rate": 7.447478846691735e-05, + "loss": 0.6336984634399414, + "step": 5550 + }, + { + "epoch": 2.342616033755274, + "grad_norm": 1.2806570529937744, + "learning_rate": 7.445384475950792e-05, + "loss": 0.579140305519104, + "step": 5552 + }, + { + "epoch": 2.343459915611814, + "grad_norm": 0.9874221086502075, + "learning_rate": 7.443289541106616e-05, + "loss": 0.6061640381813049, + "step": 5554 + }, + { + "epoch": 2.3443037974683545, + "grad_norm": 1.2271486520767212, + "learning_rate": 7.441194042642467e-05, + "loss": 0.5502339601516724, + "step": 5556 + }, + { + "epoch": 2.3451476793248944, + "grad_norm": 1.2522462606430054, + "learning_rate": 7.439097981041738e-05, + "loss": 0.5774438381195068, + "step": 5558 + }, + { + "epoch": 2.3459915611814344, + "grad_norm": 1.267204761505127, + "learning_rate": 7.437001356787945e-05, + "loss": 0.6091527342796326, + "step": 5560 + }, + { + "epoch": 2.3468354430379748, + "grad_norm": 1.1711935997009277, + "learning_rate": 7.434904170364747e-05, + "loss": 0.5443631410598755, + "step": 5562 + }, + { + "epoch": 2.3476793248945147, + "grad_norm": 1.085097074508667, + "learning_rate": 7.432806422255918e-05, + "loss": 0.5255029201507568, + "step": 5564 + }, + { + "epoch": 2.3485232067510546, + "grad_norm": 1.3244949579238892, + "learning_rate": 7.430708112945369e-05, + "loss": 0.5197238922119141, + "step": 5566 + }, + { + "epoch": 2.349367088607595, + "grad_norm": 1.3646879196166992, + "learning_rate": 7.428609242917141e-05, + "loss": 0.5576170682907104, + "step": 5568 + }, + { + "epoch": 2.350210970464135, + "grad_norm": 1.339190125465393, + "learning_rate": 7.426509812655406e-05, + "loss": 0.6254662275314331, + "step": 5570 + }, + { + "epoch": 2.351054852320675, + "grad_norm": 1.4624155759811401, + "learning_rate": 7.424409822644457e-05, + "loss": 0.6593500375747681, + "step": 5572 + }, + { + "epoch": 2.3518987341772153, + "grad_norm": 1.1931114196777344, + "learning_rate": 7.422309273368722e-05, + "loss": 0.6102238297462463, + "step": 5574 + }, + { + "epoch": 2.3527426160337552, + "grad_norm": 1.789340615272522, + "learning_rate": 7.420208165312762e-05, + "loss": 0.6695854067802429, + "step": 5576 + }, + { + "epoch": 2.353586497890295, + "grad_norm": 1.2364262342453003, + "learning_rate": 7.418106498961258e-05, + "loss": 0.578844428062439, + "step": 5578 + }, + { + "epoch": 2.3544303797468356, + "grad_norm": 1.1568509340286255, + "learning_rate": 7.416004274799027e-05, + "loss": 0.5717503428459167, + "step": 5580 + }, + { + "epoch": 2.3552742616033755, + "grad_norm": 1.1744630336761475, + "learning_rate": 7.413901493311009e-05, + "loss": 0.6170201897621155, + "step": 5582 + }, + { + "epoch": 2.3561181434599154, + "grad_norm": 1.0684332847595215, + "learning_rate": 7.411798154982275e-05, + "loss": 0.6482691764831543, + "step": 5584 + }, + { + "epoch": 2.356962025316456, + "grad_norm": 1.046196460723877, + "learning_rate": 7.409694260298025e-05, + "loss": 0.572839617729187, + "step": 5586 + }, + { + "epoch": 2.3578059071729958, + "grad_norm": 1.0110210180282593, + "learning_rate": 7.407589809743591e-05, + "loss": 0.5645976662635803, + "step": 5588 + }, + { + "epoch": 2.3586497890295357, + "grad_norm": 1.0801016092300415, + "learning_rate": 7.405484803804425e-05, + "loss": 0.5653133392333984, + "step": 5590 + }, + { + "epoch": 2.359493670886076, + "grad_norm": 1.0934380292892456, + "learning_rate": 7.403379242966116e-05, + "loss": 0.5972150564193726, + "step": 5592 + }, + { + "epoch": 2.360337552742616, + "grad_norm": 1.3722410202026367, + "learning_rate": 7.40127312771437e-05, + "loss": 0.5927542448043823, + "step": 5594 + }, + { + "epoch": 2.361181434599156, + "grad_norm": 1.1567236185073853, + "learning_rate": 7.399166458535032e-05, + "loss": 0.547027051448822, + "step": 5596 + }, + { + "epoch": 2.3620253164556964, + "grad_norm": 1.2254211902618408, + "learning_rate": 7.397059235914067e-05, + "loss": 0.5356617569923401, + "step": 5598 + }, + { + "epoch": 2.3628691983122363, + "grad_norm": 1.1529103517532349, + "learning_rate": 7.394951460337575e-05, + "loss": 0.5424175262451172, + "step": 5600 + }, + { + "epoch": 2.3628691983122363, + "eval_loss": 0.6851074695587158, + "eval_runtime": 938.5536, + "eval_samples_per_second": 2.245, + "eval_steps_per_second": 2.245, + "step": 5600 + }, + { + "epoch": 2.3637130801687762, + "grad_norm": 1.2050299644470215, + "learning_rate": 7.392843132291777e-05, + "loss": 0.5834107398986816, + "step": 5602 + }, + { + "epoch": 2.3645569620253166, + "grad_norm": 1.264567494392395, + "learning_rate": 7.390734252263024e-05, + "loss": 0.5445035099983215, + "step": 5604 + }, + { + "epoch": 2.3654008438818566, + "grad_norm": 1.357791781425476, + "learning_rate": 7.388624820737791e-05, + "loss": 0.6207653880119324, + "step": 5606 + }, + { + "epoch": 2.3662447257383965, + "grad_norm": 1.2246928215026855, + "learning_rate": 7.386514838202689e-05, + "loss": 0.6628696322441101, + "step": 5608 + }, + { + "epoch": 2.367088607594937, + "grad_norm": 1.1455399990081787, + "learning_rate": 7.384404305144447e-05, + "loss": 0.5870704054832458, + "step": 5610 + }, + { + "epoch": 2.367932489451477, + "grad_norm": 1.2338638305664062, + "learning_rate": 7.382293222049925e-05, + "loss": 0.6160538792610168, + "step": 5612 + }, + { + "epoch": 2.3687763713080168, + "grad_norm": 1.231271505355835, + "learning_rate": 7.38018158940611e-05, + "loss": 0.6274036765098572, + "step": 5614 + }, + { + "epoch": 2.369620253164557, + "grad_norm": 1.022050380706787, + "learning_rate": 7.378069407700114e-05, + "loss": 0.5623515248298645, + "step": 5616 + }, + { + "epoch": 2.370464135021097, + "grad_norm": 1.2040951251983643, + "learning_rate": 7.375956677419178e-05, + "loss": 0.5505564212799072, + "step": 5618 + }, + { + "epoch": 2.371308016877637, + "grad_norm": 1.1754523515701294, + "learning_rate": 7.373843399050668e-05, + "loss": 0.6537002921104431, + "step": 5620 + }, + { + "epoch": 2.3721518987341774, + "grad_norm": 1.1710485219955444, + "learning_rate": 7.371729573082073e-05, + "loss": 0.6224458813667297, + "step": 5622 + }, + { + "epoch": 2.3729957805907174, + "grad_norm": 1.1629483699798584, + "learning_rate": 7.36961520000102e-05, + "loss": 0.6297177076339722, + "step": 5624 + }, + { + "epoch": 2.3738396624472573, + "grad_norm": 1.1069440841674805, + "learning_rate": 7.367500280295248e-05, + "loss": 0.5202008485794067, + "step": 5626 + }, + { + "epoch": 2.3746835443037977, + "grad_norm": 1.0068297386169434, + "learning_rate": 7.36538481445263e-05, + "loss": 0.5256102681159973, + "step": 5628 + }, + { + "epoch": 2.3755274261603376, + "grad_norm": 1.1103417873382568, + "learning_rate": 7.363268802961161e-05, + "loss": 0.5460903644561768, + "step": 5630 + }, + { + "epoch": 2.3763713080168776, + "grad_norm": 1.2885268926620483, + "learning_rate": 7.361152246308969e-05, + "loss": 0.5817124247550964, + "step": 5632 + }, + { + "epoch": 2.377215189873418, + "grad_norm": 1.233831524848938, + "learning_rate": 7.359035144984302e-05, + "loss": 0.5415143966674805, + "step": 5634 + }, + { + "epoch": 2.378059071729958, + "grad_norm": 1.3451908826828003, + "learning_rate": 7.35691749947553e-05, + "loss": 0.6837685108184814, + "step": 5636 + }, + { + "epoch": 2.378902953586498, + "grad_norm": 1.1320621967315674, + "learning_rate": 7.354799310271159e-05, + "loss": 0.5966196656227112, + "step": 5638 + }, + { + "epoch": 2.379746835443038, + "grad_norm": 1.1884461641311646, + "learning_rate": 7.35268057785981e-05, + "loss": 0.5607479214668274, + "step": 5640 + }, + { + "epoch": 2.380590717299578, + "grad_norm": 1.2710856199264526, + "learning_rate": 7.350561302730236e-05, + "loss": 0.595242977142334, + "step": 5642 + }, + { + "epoch": 2.381434599156118, + "grad_norm": 1.3110458850860596, + "learning_rate": 7.348441485371314e-05, + "loss": 0.6208752393722534, + "step": 5644 + }, + { + "epoch": 2.382278481012658, + "grad_norm": 1.1734380722045898, + "learning_rate": 7.346321126272044e-05, + "loss": 0.6173125505447388, + "step": 5646 + }, + { + "epoch": 2.3831223628691984, + "grad_norm": 1.2024762630462646, + "learning_rate": 7.34420022592155e-05, + "loss": 0.6013050675392151, + "step": 5648 + }, + { + "epoch": 2.3839662447257384, + "grad_norm": 1.1305288076400757, + "learning_rate": 7.342078784809086e-05, + "loss": 0.5919594764709473, + "step": 5650 + }, + { + "epoch": 2.3848101265822783, + "grad_norm": 1.075323462486267, + "learning_rate": 7.339956803424028e-05, + "loss": 0.5399283766746521, + "step": 5652 + }, + { + "epoch": 2.3856540084388187, + "grad_norm": 1.2035599946975708, + "learning_rate": 7.337834282255873e-05, + "loss": 0.6253576874732971, + "step": 5654 + }, + { + "epoch": 2.3864978902953586, + "grad_norm": 1.0572105646133423, + "learning_rate": 7.335711221794251e-05, + "loss": 0.5247007608413696, + "step": 5656 + }, + { + "epoch": 2.3873417721518986, + "grad_norm": 1.2701191902160645, + "learning_rate": 7.333587622528906e-05, + "loss": 0.5800243020057678, + "step": 5658 + }, + { + "epoch": 2.388185654008439, + "grad_norm": 1.1772741079330444, + "learning_rate": 7.331463484949716e-05, + "loss": 0.589645504951477, + "step": 5660 + }, + { + "epoch": 2.389029535864979, + "grad_norm": 1.0562703609466553, + "learning_rate": 7.329338809546674e-05, + "loss": 0.5820419192314148, + "step": 5662 + }, + { + "epoch": 2.389873417721519, + "grad_norm": 1.1634355783462524, + "learning_rate": 7.327213596809906e-05, + "loss": 0.591435432434082, + "step": 5664 + }, + { + "epoch": 2.3907172995780592, + "grad_norm": 1.2220302820205688, + "learning_rate": 7.325087847229655e-05, + "loss": 0.5630883574485779, + "step": 5666 + }, + { + "epoch": 2.391561181434599, + "grad_norm": 1.4087659120559692, + "learning_rate": 7.322961561296294e-05, + "loss": 0.6050130128860474, + "step": 5668 + }, + { + "epoch": 2.392405063291139, + "grad_norm": 1.1126172542572021, + "learning_rate": 7.320834739500313e-05, + "loss": 0.56146240234375, + "step": 5670 + }, + { + "epoch": 2.3932489451476795, + "grad_norm": 0.99373859167099, + "learning_rate": 7.31870738233233e-05, + "loss": 0.5507852435112, + "step": 5672 + }, + { + "epoch": 2.3940928270042194, + "grad_norm": 1.14408540725708, + "learning_rate": 7.316579490283085e-05, + "loss": 0.5895347595214844, + "step": 5674 + }, + { + "epoch": 2.3949367088607594, + "grad_norm": 1.1728581190109253, + "learning_rate": 7.314451063843443e-05, + "loss": 0.5304404497146606, + "step": 5676 + }, + { + "epoch": 2.3957805907172998, + "grad_norm": 1.1721378564834595, + "learning_rate": 7.31232210350439e-05, + "loss": 0.5805793404579163, + "step": 5678 + }, + { + "epoch": 2.3966244725738397, + "grad_norm": 1.0499866008758545, + "learning_rate": 7.310192609757038e-05, + "loss": 0.5671767592430115, + "step": 5680 + }, + { + "epoch": 2.3974683544303796, + "grad_norm": 1.0959177017211914, + "learning_rate": 7.308062583092617e-05, + "loss": 0.6335723400115967, + "step": 5682 + }, + { + "epoch": 2.3983122362869196, + "grad_norm": 1.31142258644104, + "learning_rate": 7.305932024002487e-05, + "loss": 0.6032374501228333, + "step": 5684 + }, + { + "epoch": 2.39915611814346, + "grad_norm": 0.9212818741798401, + "learning_rate": 7.303800932978124e-05, + "loss": 0.5492936372756958, + "step": 5686 + }, + { + "epoch": 2.4, + "grad_norm": 1.1956428289413452, + "learning_rate": 7.301669310511132e-05, + "loss": 0.5533297061920166, + "step": 5688 + }, + { + "epoch": 2.40084388185654, + "grad_norm": 1.4048634767532349, + "learning_rate": 7.299537157093232e-05, + "loss": 0.5859368443489075, + "step": 5690 + }, + { + "epoch": 2.4016877637130802, + "grad_norm": 1.0580679178237915, + "learning_rate": 7.297404473216277e-05, + "loss": 0.5099439024925232, + "step": 5692 + }, + { + "epoch": 2.40253164556962, + "grad_norm": 1.2450575828552246, + "learning_rate": 7.29527125937223e-05, + "loss": 0.5631486177444458, + "step": 5694 + }, + { + "epoch": 2.40337552742616, + "grad_norm": 1.338466763496399, + "learning_rate": 7.293137516053187e-05, + "loss": 0.6045404672622681, + "step": 5696 + }, + { + "epoch": 2.4042194092827005, + "grad_norm": 1.198588252067566, + "learning_rate": 7.291003243751358e-05, + "loss": 0.6063475608825684, + "step": 5698 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 1.2315080165863037, + "learning_rate": 7.288868442959081e-05, + "loss": 0.5734809041023254, + "step": 5700 + }, + { + "epoch": 2.4050632911392404, + "eval_loss": 0.6841402053833008, + "eval_runtime": 941.6641, + "eval_samples_per_second": 2.238, + "eval_steps_per_second": 2.238, + "step": 5700 + }, + { + "epoch": 2.4059071729957804, + "grad_norm": 1.1494885683059692, + "learning_rate": 7.286733114168812e-05, + "loss": 0.5744594931602478, + "step": 5702 + }, + { + "epoch": 2.4067510548523208, + "grad_norm": 1.3769505023956299, + "learning_rate": 7.284597257873132e-05, + "loss": 0.611789882183075, + "step": 5704 + }, + { + "epoch": 2.4075949367088607, + "grad_norm": 1.2326449155807495, + "learning_rate": 7.28246087456474e-05, + "loss": 0.6091431975364685, + "step": 5706 + }, + { + "epoch": 2.4084388185654007, + "grad_norm": 1.1960830688476562, + "learning_rate": 7.28032396473646e-05, + "loss": 0.49431973695755005, + "step": 5708 + }, + { + "epoch": 2.409282700421941, + "grad_norm": 1.1672827005386353, + "learning_rate": 7.278186528881237e-05, + "loss": 0.5344718098640442, + "step": 5710 + }, + { + "epoch": 2.410126582278481, + "grad_norm": 1.1923719644546509, + "learning_rate": 7.276048567492136e-05, + "loss": 0.6011165380477905, + "step": 5712 + }, + { + "epoch": 2.410970464135021, + "grad_norm": 1.2314990758895874, + "learning_rate": 7.273910081062341e-05, + "loss": 0.6300925016403198, + "step": 5714 + }, + { + "epoch": 2.4118143459915613, + "grad_norm": 0.8976680040359497, + "learning_rate": 7.27177107008516e-05, + "loss": 0.56329345703125, + "step": 5716 + }, + { + "epoch": 2.4126582278481012, + "grad_norm": 1.2954038381576538, + "learning_rate": 7.269631535054026e-05, + "loss": 0.6266427040100098, + "step": 5718 + }, + { + "epoch": 2.413502109704641, + "grad_norm": 1.3357585668563843, + "learning_rate": 7.267491476462485e-05, + "loss": 0.6234018802642822, + "step": 5720 + }, + { + "epoch": 2.4143459915611816, + "grad_norm": 1.1913645267486572, + "learning_rate": 7.265350894804209e-05, + "loss": 0.5909059047698975, + "step": 5722 + }, + { + "epoch": 2.4151898734177215, + "grad_norm": 1.3425955772399902, + "learning_rate": 7.263209790572986e-05, + "loss": 0.5708479285240173, + "step": 5724 + }, + { + "epoch": 2.4160337552742615, + "grad_norm": 1.2258507013320923, + "learning_rate": 7.261068164262734e-05, + "loss": 0.5810034871101379, + "step": 5726 + }, + { + "epoch": 2.416877637130802, + "grad_norm": 1.348794937133789, + "learning_rate": 7.258926016367479e-05, + "loss": 0.5939235687255859, + "step": 5728 + }, + { + "epoch": 2.4177215189873418, + "grad_norm": 1.0896574258804321, + "learning_rate": 7.256783347381375e-05, + "loss": 0.6298259496688843, + "step": 5730 + }, + { + "epoch": 2.4185654008438817, + "grad_norm": 1.164866328239441, + "learning_rate": 7.254640157798696e-05, + "loss": 0.5277430415153503, + "step": 5732 + }, + { + "epoch": 2.419409282700422, + "grad_norm": 1.1215453147888184, + "learning_rate": 7.252496448113833e-05, + "loss": 0.5724055767059326, + "step": 5734 + }, + { + "epoch": 2.420253164556962, + "grad_norm": 1.0640764236450195, + "learning_rate": 7.2503522188213e-05, + "loss": 0.5439977645874023, + "step": 5736 + }, + { + "epoch": 2.421097046413502, + "grad_norm": 1.4874604940414429, + "learning_rate": 7.248207470415729e-05, + "loss": 0.7568614482879639, + "step": 5738 + }, + { + "epoch": 2.4219409282700424, + "grad_norm": 1.2611099481582642, + "learning_rate": 7.246062203391873e-05, + "loss": 0.6389632225036621, + "step": 5740 + }, + { + "epoch": 2.4227848101265823, + "grad_norm": 1.185644507408142, + "learning_rate": 7.243916418244602e-05, + "loss": 0.6180628538131714, + "step": 5742 + }, + { + "epoch": 2.4236286919831223, + "grad_norm": 1.1648430824279785, + "learning_rate": 7.241770115468909e-05, + "loss": 0.619799017906189, + "step": 5744 + }, + { + "epoch": 2.4244725738396626, + "grad_norm": 1.1974445581436157, + "learning_rate": 7.239623295559903e-05, + "loss": 0.6446201205253601, + "step": 5746 + }, + { + "epoch": 2.4253164556962026, + "grad_norm": 1.140477180480957, + "learning_rate": 7.237475959012818e-05, + "loss": 0.5839580297470093, + "step": 5748 + }, + { + "epoch": 2.4261603375527425, + "grad_norm": 1.1374423503875732, + "learning_rate": 7.235328106322998e-05, + "loss": 0.48815420269966125, + "step": 5750 + }, + { + "epoch": 2.427004219409283, + "grad_norm": 1.411432147026062, + "learning_rate": 7.233179737985916e-05, + "loss": 0.638519287109375, + "step": 5752 + }, + { + "epoch": 2.427848101265823, + "grad_norm": 1.1232497692108154, + "learning_rate": 7.231030854497157e-05, + "loss": 0.5776677131652832, + "step": 5754 + }, + { + "epoch": 2.428691983122363, + "grad_norm": 1.0815738439559937, + "learning_rate": 7.228881456352428e-05, + "loss": 0.5297027230262756, + "step": 5756 + }, + { + "epoch": 2.429535864978903, + "grad_norm": 1.2230733633041382, + "learning_rate": 7.226731544047553e-05, + "loss": 0.5630011558532715, + "step": 5758 + }, + { + "epoch": 2.430379746835443, + "grad_norm": 1.2033147811889648, + "learning_rate": 7.224581118078476e-05, + "loss": 0.5772101283073425, + "step": 5760 + }, + { + "epoch": 2.431223628691983, + "grad_norm": 1.2150053977966309, + "learning_rate": 7.22243017894126e-05, + "loss": 0.5412847399711609, + "step": 5762 + }, + { + "epoch": 2.4320675105485234, + "grad_norm": 1.0494824647903442, + "learning_rate": 7.220278727132083e-05, + "loss": 0.5568405389785767, + "step": 5764 + }, + { + "epoch": 2.4329113924050634, + "grad_norm": 1.2803306579589844, + "learning_rate": 7.218126763147244e-05, + "loss": 0.6022217869758606, + "step": 5766 + }, + { + "epoch": 2.4337552742616033, + "grad_norm": 1.0832798480987549, + "learning_rate": 7.215974287483163e-05, + "loss": 0.5568796396255493, + "step": 5768 + }, + { + "epoch": 2.4345991561181437, + "grad_norm": 1.1829264163970947, + "learning_rate": 7.213821300636372e-05, + "loss": 0.5607990026473999, + "step": 5770 + }, + { + "epoch": 2.4354430379746836, + "grad_norm": 2.3017473220825195, + "learning_rate": 7.211667803103523e-05, + "loss": 0.6382274031639099, + "step": 5772 + }, + { + "epoch": 2.4362869198312236, + "grad_norm": 1.1701387166976929, + "learning_rate": 7.209513795381388e-05, + "loss": 0.5748776793479919, + "step": 5774 + }, + { + "epoch": 2.4371308016877635, + "grad_norm": 1.0480856895446777, + "learning_rate": 7.207359277966856e-05, + "loss": 0.5760934352874756, + "step": 5776 + }, + { + "epoch": 2.437974683544304, + "grad_norm": 1.2263693809509277, + "learning_rate": 7.20520425135693e-05, + "loss": 0.6387208104133606, + "step": 5778 + }, + { + "epoch": 2.438818565400844, + "grad_norm": 1.219246506690979, + "learning_rate": 7.203048716048737e-05, + "loss": 0.6078037619590759, + "step": 5780 + }, + { + "epoch": 2.439662447257384, + "grad_norm": 1.2452640533447266, + "learning_rate": 7.200892672539515e-05, + "loss": 0.606924831867218, + "step": 5782 + }, + { + "epoch": 2.440506329113924, + "grad_norm": 1.3469732999801636, + "learning_rate": 7.198736121326621e-05, + "loss": 0.585297703742981, + "step": 5784 + }, + { + "epoch": 2.441350210970464, + "grad_norm": 1.151127576828003, + "learning_rate": 7.196579062907533e-05, + "loss": 0.5849902033805847, + "step": 5786 + }, + { + "epoch": 2.442194092827004, + "grad_norm": 1.0669564008712769, + "learning_rate": 7.19442149777984e-05, + "loss": 0.6150397062301636, + "step": 5788 + }, + { + "epoch": 2.4430379746835444, + "grad_norm": 1.1700209379196167, + "learning_rate": 7.192263426441252e-05, + "loss": 0.6324567794799805, + "step": 5790 + }, + { + "epoch": 2.4438818565400844, + "grad_norm": 1.2832094430923462, + "learning_rate": 7.190104849389597e-05, + "loss": 0.6202381253242493, + "step": 5792 + }, + { + "epoch": 2.4447257383966243, + "grad_norm": 1.2046177387237549, + "learning_rate": 7.187945767122813e-05, + "loss": 0.6156684756278992, + "step": 5794 + }, + { + "epoch": 2.4455696202531647, + "grad_norm": 1.031133770942688, + "learning_rate": 7.185786180138961e-05, + "loss": 0.5763497352600098, + "step": 5796 + }, + { + "epoch": 2.4464135021097047, + "grad_norm": 1.2803475856781006, + "learning_rate": 7.183626088936216e-05, + "loss": 0.5419677495956421, + "step": 5798 + }, + { + "epoch": 2.4472573839662446, + "grad_norm": 1.2407588958740234, + "learning_rate": 7.181465494012869e-05, + "loss": 0.629108190536499, + "step": 5800 + }, + { + "epoch": 2.4472573839662446, + "eval_loss": 0.6835155487060547, + "eval_runtime": 758.407, + "eval_samples_per_second": 2.778, + "eval_steps_per_second": 2.778, + "step": 5800 + }, + { + "epoch": 2.448101265822785, + "grad_norm": 1.3525878190994263, + "learning_rate": 7.17930439586733e-05, + "loss": 0.6146516799926758, + "step": 5802 + }, + { + "epoch": 2.448945147679325, + "grad_norm": 1.255921721458435, + "learning_rate": 7.177142794998121e-05, + "loss": 0.5796315670013428, + "step": 5804 + }, + { + "epoch": 2.449789029535865, + "grad_norm": 1.2135448455810547, + "learning_rate": 7.174980691903881e-05, + "loss": 0.5978766679763794, + "step": 5806 + }, + { + "epoch": 2.4506329113924052, + "grad_norm": 1.117942214012146, + "learning_rate": 7.172818087083367e-05, + "loss": 0.5941054821014404, + "step": 5808 + }, + { + "epoch": 2.451476793248945, + "grad_norm": 1.2917672395706177, + "learning_rate": 7.17065498103545e-05, + "loss": 0.6213865876197815, + "step": 5810 + }, + { + "epoch": 2.452320675105485, + "grad_norm": 1.2287952899932861, + "learning_rate": 7.168491374259118e-05, + "loss": 0.627090573310852, + "step": 5812 + }, + { + "epoch": 2.453164556962025, + "grad_norm": 1.2427480220794678, + "learning_rate": 7.16632726725347e-05, + "loss": 0.605871319770813, + "step": 5814 + }, + { + "epoch": 2.4540084388185655, + "grad_norm": 1.2568929195404053, + "learning_rate": 7.16416266051773e-05, + "loss": 0.5961518883705139, + "step": 5816 + }, + { + "epoch": 2.4548523206751054, + "grad_norm": 1.2202998399734497, + "learning_rate": 7.161997554551226e-05, + "loss": 0.585054874420166, + "step": 5818 + }, + { + "epoch": 2.4556962025316453, + "grad_norm": 1.2326043844223022, + "learning_rate": 7.159831949853409e-05, + "loss": 0.6219096779823303, + "step": 5820 + }, + { + "epoch": 2.4565400843881857, + "grad_norm": 1.2161623239517212, + "learning_rate": 7.15766584692384e-05, + "loss": 0.641189455986023, + "step": 5822 + }, + { + "epoch": 2.4573839662447257, + "grad_norm": 1.2391023635864258, + "learning_rate": 7.1554992462622e-05, + "loss": 0.577190101146698, + "step": 5824 + }, + { + "epoch": 2.4582278481012656, + "grad_norm": 1.0883333683013916, + "learning_rate": 7.153332148368281e-05, + "loss": 0.5264694690704346, + "step": 5826 + }, + { + "epoch": 2.459071729957806, + "grad_norm": 1.2129524946212769, + "learning_rate": 7.15116455374199e-05, + "loss": 0.631437361240387, + "step": 5828 + }, + { + "epoch": 2.459915611814346, + "grad_norm": 1.0476374626159668, + "learning_rate": 7.148996462883352e-05, + "loss": 0.5025489926338196, + "step": 5830 + }, + { + "epoch": 2.460759493670886, + "grad_norm": 1.1389570236206055, + "learning_rate": 7.146827876292502e-05, + "loss": 0.5903586745262146, + "step": 5832 + }, + { + "epoch": 2.4616033755274263, + "grad_norm": 1.4385539293289185, + "learning_rate": 7.14465879446969e-05, + "loss": 0.633786141872406, + "step": 5834 + }, + { + "epoch": 2.462447257383966, + "grad_norm": 1.1184585094451904, + "learning_rate": 7.142489217915283e-05, + "loss": 0.5889136791229248, + "step": 5836 + }, + { + "epoch": 2.463291139240506, + "grad_norm": 1.2257685661315918, + "learning_rate": 7.140319147129763e-05, + "loss": 0.5774597525596619, + "step": 5838 + }, + { + "epoch": 2.4641350210970465, + "grad_norm": 0.9524238109588623, + "learning_rate": 7.13814858261372e-05, + "loss": 0.5220611095428467, + "step": 5840 + }, + { + "epoch": 2.4649789029535865, + "grad_norm": 1.2814422845840454, + "learning_rate": 7.135977524867861e-05, + "loss": 0.5724858641624451, + "step": 5842 + }, + { + "epoch": 2.4658227848101264, + "grad_norm": 1.0978140830993652, + "learning_rate": 7.133805974393013e-05, + "loss": 0.5469759702682495, + "step": 5844 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 1.310279130935669, + "learning_rate": 7.131633931690104e-05, + "loss": 0.6554312705993652, + "step": 5846 + }, + { + "epoch": 2.4675105485232067, + "grad_norm": 1.286189317703247, + "learning_rate": 7.129461397260187e-05, + "loss": 0.6166019439697266, + "step": 5848 + }, + { + "epoch": 2.4683544303797467, + "grad_norm": 1.1586377620697021, + "learning_rate": 7.127288371604424e-05, + "loss": 0.6301121711730957, + "step": 5850 + }, + { + "epoch": 2.469198312236287, + "grad_norm": 1.1684564352035522, + "learning_rate": 7.125114855224087e-05, + "loss": 0.6022663712501526, + "step": 5852 + }, + { + "epoch": 2.470042194092827, + "grad_norm": 1.182511329650879, + "learning_rate": 7.122940848620567e-05, + "loss": 0.5959302186965942, + "step": 5854 + }, + { + "epoch": 2.470886075949367, + "grad_norm": 1.2383002042770386, + "learning_rate": 7.120766352295366e-05, + "loss": 0.6251413822174072, + "step": 5856 + }, + { + "epoch": 2.4717299578059073, + "grad_norm": 1.2001979351043701, + "learning_rate": 7.118591366750097e-05, + "loss": 0.6332544088363647, + "step": 5858 + }, + { + "epoch": 2.4725738396624473, + "grad_norm": 1.2166392803192139, + "learning_rate": 7.116415892486488e-05, + "loss": 0.5797795057296753, + "step": 5860 + }, + { + "epoch": 2.473417721518987, + "grad_norm": 1.2235382795333862, + "learning_rate": 7.114239930006379e-05, + "loss": 0.5335313081741333, + "step": 5862 + }, + { + "epoch": 2.4742616033755276, + "grad_norm": 1.2405973672866821, + "learning_rate": 7.112063479811724e-05, + "loss": 0.5536905527114868, + "step": 5864 + }, + { + "epoch": 2.4751054852320675, + "grad_norm": 1.116328477859497, + "learning_rate": 7.109886542404585e-05, + "loss": 0.554654061794281, + "step": 5866 + }, + { + "epoch": 2.4759493670886075, + "grad_norm": 1.2757837772369385, + "learning_rate": 7.107709118287143e-05, + "loss": 0.6017873287200928, + "step": 5868 + }, + { + "epoch": 2.476793248945148, + "grad_norm": 1.3445937633514404, + "learning_rate": 7.105531207961686e-05, + "loss": 0.6479908227920532, + "step": 5870 + }, + { + "epoch": 2.477637130801688, + "grad_norm": 1.1464542150497437, + "learning_rate": 7.103352811930619e-05, + "loss": 0.5829157829284668, + "step": 5872 + }, + { + "epoch": 2.4784810126582277, + "grad_norm": 1.3275130987167358, + "learning_rate": 7.101173930696453e-05, + "loss": 0.54380863904953, + "step": 5874 + }, + { + "epoch": 2.479324894514768, + "grad_norm": 1.006990909576416, + "learning_rate": 7.098994564761813e-05, + "loss": 0.6313910484313965, + "step": 5876 + }, + { + "epoch": 2.480168776371308, + "grad_norm": 1.1358299255371094, + "learning_rate": 7.09681471462944e-05, + "loss": 0.5343483090400696, + "step": 5878 + }, + { + "epoch": 2.481012658227848, + "grad_norm": 1.1456117630004883, + "learning_rate": 7.094634380802184e-05, + "loss": 0.49450409412384033, + "step": 5880 + }, + { + "epoch": 2.4818565400843884, + "grad_norm": 1.2961846590042114, + "learning_rate": 7.092453563783003e-05, + "loss": 0.6378757357597351, + "step": 5882 + }, + { + "epoch": 2.4827004219409283, + "grad_norm": 0.983889102935791, + "learning_rate": 7.090272264074972e-05, + "loss": 0.5937124490737915, + "step": 5884 + }, + { + "epoch": 2.4835443037974683, + "grad_norm": 1.0205817222595215, + "learning_rate": 7.088090482181273e-05, + "loss": 0.5301283597946167, + "step": 5886 + }, + { + "epoch": 2.4843881856540087, + "grad_norm": 1.1721397638320923, + "learning_rate": 7.085908218605204e-05, + "loss": 0.6191756129264832, + "step": 5888 + }, + { + "epoch": 2.4852320675105486, + "grad_norm": 1.2432814836502075, + "learning_rate": 7.083725473850168e-05, + "loss": 0.5928890109062195, + "step": 5890 + }, + { + "epoch": 2.4860759493670885, + "grad_norm": 1.252125859260559, + "learning_rate": 7.081542248419686e-05, + "loss": 0.6136764287948608, + "step": 5892 + }, + { + "epoch": 2.486919831223629, + "grad_norm": 1.3686699867248535, + "learning_rate": 7.079358542817382e-05, + "loss": 0.6084910035133362, + "step": 5894 + }, + { + "epoch": 2.487763713080169, + "grad_norm": 1.0877282619476318, + "learning_rate": 7.077174357546996e-05, + "loss": 0.5862250924110413, + "step": 5896 + }, + { + "epoch": 2.488607594936709, + "grad_norm": 1.164095401763916, + "learning_rate": 7.074989693112381e-05, + "loss": 0.6300894021987915, + "step": 5898 + }, + { + "epoch": 2.489451476793249, + "grad_norm": 1.1169507503509521, + "learning_rate": 7.072804550017493e-05, + "loss": 0.5508570075035095, + "step": 5900 + }, + { + "epoch": 2.489451476793249, + "eval_loss": 0.6820966005325317, + "eval_runtime": 513.3515, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 5900 + }, + { + "epoch": 2.490295358649789, + "grad_norm": 1.1718615293502808, + "learning_rate": 7.070618928766406e-05, + "loss": 0.550847589969635, + "step": 5902 + }, + { + "epoch": 2.491139240506329, + "grad_norm": 1.4725650548934937, + "learning_rate": 7.068432829863298e-05, + "loss": 0.5663347840309143, + "step": 5904 + }, + { + "epoch": 2.491983122362869, + "grad_norm": 1.042083978652954, + "learning_rate": 7.066246253812462e-05, + "loss": 0.5506191849708557, + "step": 5906 + }, + { + "epoch": 2.4928270042194094, + "grad_norm": 1.2020974159240723, + "learning_rate": 7.064059201118297e-05, + "loss": 0.5656929612159729, + "step": 5908 + }, + { + "epoch": 2.4936708860759493, + "grad_norm": 1.1040663719177246, + "learning_rate": 7.061871672285317e-05, + "loss": 0.5159370303153992, + "step": 5910 + }, + { + "epoch": 2.4945147679324893, + "grad_norm": 1.3681589365005493, + "learning_rate": 7.05968366781814e-05, + "loss": 0.6161949634552002, + "step": 5912 + }, + { + "epoch": 2.4953586497890297, + "grad_norm": 1.26628839969635, + "learning_rate": 7.057495188221498e-05, + "loss": 0.6357758641242981, + "step": 5914 + }, + { + "epoch": 2.4962025316455696, + "grad_norm": 1.2714020013809204, + "learning_rate": 7.05530623400023e-05, + "loss": 0.5467366576194763, + "step": 5916 + }, + { + "epoch": 2.4970464135021095, + "grad_norm": 1.2255018949508667, + "learning_rate": 7.053116805659287e-05, + "loss": 0.592526376247406, + "step": 5918 + }, + { + "epoch": 2.49789029535865, + "grad_norm": 1.2816206216812134, + "learning_rate": 7.050926903703729e-05, + "loss": 0.5819981694221497, + "step": 5920 + }, + { + "epoch": 2.49873417721519, + "grad_norm": 1.1938221454620361, + "learning_rate": 7.048736528638722e-05, + "loss": 0.6037712693214417, + "step": 5922 + }, + { + "epoch": 2.49957805907173, + "grad_norm": 1.1330323219299316, + "learning_rate": 7.046545680969545e-05, + "loss": 0.5567215085029602, + "step": 5924 + }, + { + "epoch": 2.50042194092827, + "grad_norm": 1.233564019203186, + "learning_rate": 7.044354361201585e-05, + "loss": 0.5626974105834961, + "step": 5926 + }, + { + "epoch": 2.50126582278481, + "grad_norm": 1.1913540363311768, + "learning_rate": 7.042162569840336e-05, + "loss": 0.5672739744186401, + "step": 5928 + }, + { + "epoch": 2.50210970464135, + "grad_norm": 1.060952067375183, + "learning_rate": 7.039970307391402e-05, + "loss": 0.5965602993965149, + "step": 5930 + }, + { + "epoch": 2.5029535864978905, + "grad_norm": 1.2003182172775269, + "learning_rate": 7.037777574360497e-05, + "loss": 0.590932309627533, + "step": 5932 + }, + { + "epoch": 2.5037974683544304, + "grad_norm": 1.073434829711914, + "learning_rate": 7.035584371253441e-05, + "loss": 0.5736868381500244, + "step": 5934 + }, + { + "epoch": 2.5046413502109703, + "grad_norm": 1.2641130685806274, + "learning_rate": 7.033390698576166e-05, + "loss": 0.614703357219696, + "step": 5936 + }, + { + "epoch": 2.5054852320675103, + "grad_norm": 1.2406511306762695, + "learning_rate": 7.031196556834708e-05, + "loss": 0.5866397023200989, + "step": 5938 + }, + { + "epoch": 2.5063291139240507, + "grad_norm": 1.231619119644165, + "learning_rate": 7.029001946535215e-05, + "loss": 0.5792667865753174, + "step": 5940 + }, + { + "epoch": 2.5071729957805906, + "grad_norm": 1.419447660446167, + "learning_rate": 7.026806868183939e-05, + "loss": 0.5686604976654053, + "step": 5942 + }, + { + "epoch": 2.5080168776371305, + "grad_norm": 1.139244556427002, + "learning_rate": 7.024611322287245e-05, + "loss": 0.5860661268234253, + "step": 5944 + }, + { + "epoch": 2.508860759493671, + "grad_norm": 1.070517897605896, + "learning_rate": 7.022415309351602e-05, + "loss": 0.5823250412940979, + "step": 5946 + }, + { + "epoch": 2.509704641350211, + "grad_norm": 1.0775398015975952, + "learning_rate": 7.020218829883589e-05, + "loss": 0.5291389226913452, + "step": 5948 + }, + { + "epoch": 2.510548523206751, + "grad_norm": 1.339716911315918, + "learning_rate": 7.018021884389892e-05, + "loss": 0.6215447783470154, + "step": 5950 + }, + { + "epoch": 2.511392405063291, + "grad_norm": 1.3589707612991333, + "learning_rate": 7.0158244733773e-05, + "loss": 0.5419909358024597, + "step": 5952 + }, + { + "epoch": 2.512236286919831, + "grad_norm": 1.1664098501205444, + "learning_rate": 7.01362659735272e-05, + "loss": 0.5476977229118347, + "step": 5954 + }, + { + "epoch": 2.513080168776371, + "grad_norm": 1.1184223890304565, + "learning_rate": 7.011428256823154e-05, + "loss": 0.5896323919296265, + "step": 5956 + }, + { + "epoch": 2.5139240506329115, + "grad_norm": 1.4071170091629028, + "learning_rate": 7.00922945229572e-05, + "loss": 0.6353691220283508, + "step": 5958 + }, + { + "epoch": 2.5147679324894514, + "grad_norm": 1.3740885257720947, + "learning_rate": 7.007030184277641e-05, + "loss": 0.6605582237243652, + "step": 5960 + }, + { + "epoch": 2.5156118143459913, + "grad_norm": 1.071395754814148, + "learning_rate": 7.004830453276241e-05, + "loss": 0.6399887800216675, + "step": 5962 + }, + { + "epoch": 2.5164556962025317, + "grad_norm": 1.2292311191558838, + "learning_rate": 7.002630259798962e-05, + "loss": 0.5992775559425354, + "step": 5964 + }, + { + "epoch": 2.5172995780590717, + "grad_norm": 1.0133391618728638, + "learning_rate": 7.000429604353341e-05, + "loss": 0.5716721415519714, + "step": 5966 + }, + { + "epoch": 2.5181434599156116, + "grad_norm": 1.2669343948364258, + "learning_rate": 6.998228487447032e-05, + "loss": 0.5455520749092102, + "step": 5968 + }, + { + "epoch": 2.518987341772152, + "grad_norm": 1.2026386260986328, + "learning_rate": 6.996026909587785e-05, + "loss": 0.6411572694778442, + "step": 5970 + }, + { + "epoch": 2.519831223628692, + "grad_norm": 1.359923243522644, + "learning_rate": 6.993824871283465e-05, + "loss": 0.6687750220298767, + "step": 5972 + }, + { + "epoch": 2.520675105485232, + "grad_norm": 1.1265650987625122, + "learning_rate": 6.99162237304204e-05, + "loss": 0.6271382570266724, + "step": 5974 + }, + { + "epoch": 2.5215189873417723, + "grad_norm": 1.197667121887207, + "learning_rate": 6.989419415371583e-05, + "loss": 0.6191279888153076, + "step": 5976 + }, + { + "epoch": 2.522362869198312, + "grad_norm": 1.169992446899414, + "learning_rate": 6.987215998780275e-05, + "loss": 0.6313687562942505, + "step": 5978 + }, + { + "epoch": 2.523206751054852, + "grad_norm": 1.2706433534622192, + "learning_rate": 6.9850121237764e-05, + "loss": 0.6058336496353149, + "step": 5980 + }, + { + "epoch": 2.5240506329113925, + "grad_norm": 1.322376012802124, + "learning_rate": 6.982807790868352e-05, + "loss": 0.6466464400291443, + "step": 5982 + }, + { + "epoch": 2.5248945147679325, + "grad_norm": 1.2398571968078613, + "learning_rate": 6.980603000564626e-05, + "loss": 0.5730098485946655, + "step": 5984 + }, + { + "epoch": 2.5257383966244724, + "grad_norm": 1.2035216093063354, + "learning_rate": 6.978397753373826e-05, + "loss": 0.5305635333061218, + "step": 5986 + }, + { + "epoch": 2.526582278481013, + "grad_norm": 1.1951299905776978, + "learning_rate": 6.976192049804661e-05, + "loss": 0.5601096153259277, + "step": 5988 + }, + { + "epoch": 2.5274261603375527, + "grad_norm": 0.9950459599494934, + "learning_rate": 6.973985890365945e-05, + "loss": 0.5049516558647156, + "step": 5990 + }, + { + "epoch": 2.5282700421940927, + "grad_norm": 1.2581008672714233, + "learning_rate": 6.971779275566593e-05, + "loss": 0.5456960797309875, + "step": 5992 + }, + { + "epoch": 2.529113924050633, + "grad_norm": 1.2196903228759766, + "learning_rate": 6.969572205915632e-05, + "loss": 0.6026827096939087, + "step": 5994 + }, + { + "epoch": 2.529957805907173, + "grad_norm": 1.3109357357025146, + "learning_rate": 6.967364681922189e-05, + "loss": 0.597453236579895, + "step": 5996 + }, + { + "epoch": 2.530801687763713, + "grad_norm": 1.016904354095459, + "learning_rate": 6.965156704095498e-05, + "loss": 0.5304323434829712, + "step": 5998 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 1.2363858222961426, + "learning_rate": 6.962948272944896e-05, + "loss": 0.5748253464698792, + "step": 6000 + }, + { + "epoch": 2.5316455696202533, + "eval_loss": 0.6813357472419739, + "eval_runtime": 513.5491, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6000 + }, + { + "epoch": 2.5324894514767933, + "grad_norm": 1.1766576766967773, + "learning_rate": 6.960739388979827e-05, + "loss": 0.613327145576477, + "step": 6002 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 1.4065337181091309, + "learning_rate": 6.95853005270984e-05, + "loss": 0.6648217439651489, + "step": 6004 + }, + { + "epoch": 2.5341772151898736, + "grad_norm": 0.9513862133026123, + "learning_rate": 6.956320264644582e-05, + "loss": 0.5165349841117859, + "step": 6006 + }, + { + "epoch": 2.5350210970464135, + "grad_norm": 1.1104962825775146, + "learning_rate": 6.95411002529381e-05, + "loss": 0.5594159364700317, + "step": 6008 + }, + { + "epoch": 2.5358649789029535, + "grad_norm": 1.1698877811431885, + "learning_rate": 6.951899335167386e-05, + "loss": 0.5662833452224731, + "step": 6010 + }, + { + "epoch": 2.536708860759494, + "grad_norm": 1.2051950693130493, + "learning_rate": 6.949688194775272e-05, + "loss": 0.5780806541442871, + "step": 6012 + }, + { + "epoch": 2.537552742616034, + "grad_norm": 1.2434250116348267, + "learning_rate": 6.947476604627536e-05, + "loss": 0.6112543344497681, + "step": 6014 + }, + { + "epoch": 2.5383966244725737, + "grad_norm": 1.1473076343536377, + "learning_rate": 6.945264565234348e-05, + "loss": 0.5556519031524658, + "step": 6016 + }, + { + "epoch": 2.539240506329114, + "grad_norm": 1.3139631748199463, + "learning_rate": 6.943052077105987e-05, + "loss": 0.6664283275604248, + "step": 6018 + }, + { + "epoch": 2.540084388185654, + "grad_norm": 1.3407402038574219, + "learning_rate": 6.940839140752825e-05, + "loss": 0.6358945369720459, + "step": 6020 + }, + { + "epoch": 2.540928270042194, + "grad_norm": 1.2223491668701172, + "learning_rate": 6.938625756685352e-05, + "loss": 0.6310063600540161, + "step": 6022 + }, + { + "epoch": 2.5417721518987344, + "grad_norm": 1.3984094858169556, + "learning_rate": 6.936411925414146e-05, + "loss": 0.6090726256370544, + "step": 6024 + }, + { + "epoch": 2.5426160337552743, + "grad_norm": 1.1876440048217773, + "learning_rate": 6.9341976474499e-05, + "loss": 0.585586428642273, + "step": 6026 + }, + { + "epoch": 2.5434599156118143, + "grad_norm": 1.2213155031204224, + "learning_rate": 6.931982923303402e-05, + "loss": 0.6382114887237549, + "step": 6028 + }, + { + "epoch": 2.5443037974683547, + "grad_norm": 1.0637959241867065, + "learning_rate": 6.92976775348555e-05, + "loss": 0.5851555466651917, + "step": 6030 + }, + { + "epoch": 2.5451476793248946, + "grad_norm": 1.150227665901184, + "learning_rate": 6.927552138507337e-05, + "loss": 0.5867910385131836, + "step": 6032 + }, + { + "epoch": 2.5459915611814345, + "grad_norm": 1.1405255794525146, + "learning_rate": 6.925336078879865e-05, + "loss": 0.5876969695091248, + "step": 6034 + }, + { + "epoch": 2.546835443037975, + "grad_norm": 1.0269757509231567, + "learning_rate": 6.923119575114339e-05, + "loss": 0.626306414604187, + "step": 6036 + }, + { + "epoch": 2.547679324894515, + "grad_norm": 1.1978809833526611, + "learning_rate": 6.920902627722059e-05, + "loss": 0.645074188709259, + "step": 6038 + }, + { + "epoch": 2.548523206751055, + "grad_norm": 1.1684149503707886, + "learning_rate": 6.918685237214435e-05, + "loss": 0.6284276247024536, + "step": 6040 + }, + { + "epoch": 2.549367088607595, + "grad_norm": 1.2538992166519165, + "learning_rate": 6.916467404102977e-05, + "loss": 0.5770997405052185, + "step": 6042 + }, + { + "epoch": 2.550210970464135, + "grad_norm": 1.2381856441497803, + "learning_rate": 6.914249128899294e-05, + "loss": 0.5501131415367126, + "step": 6044 + }, + { + "epoch": 2.551054852320675, + "grad_norm": 1.0487099885940552, + "learning_rate": 6.912030412115101e-05, + "loss": 0.5362627506256104, + "step": 6046 + }, + { + "epoch": 2.5518987341772155, + "grad_norm": 1.3471804857254028, + "learning_rate": 6.909811254262213e-05, + "loss": 0.6694624423980713, + "step": 6048 + }, + { + "epoch": 2.5527426160337554, + "grad_norm": 1.4262096881866455, + "learning_rate": 6.907591655852547e-05, + "loss": 0.642368733882904, + "step": 6050 + }, + { + "epoch": 2.5535864978902953, + "grad_norm": 1.171004295349121, + "learning_rate": 6.905371617398122e-05, + "loss": 0.6266166567802429, + "step": 6052 + }, + { + "epoch": 2.5544303797468353, + "grad_norm": 1.1249992847442627, + "learning_rate": 6.90315113941106e-05, + "loss": 0.5518985986709595, + "step": 6054 + }, + { + "epoch": 2.5552742616033757, + "grad_norm": 1.3049964904785156, + "learning_rate": 6.900930222403579e-05, + "loss": 0.5367884039878845, + "step": 6056 + }, + { + "epoch": 2.5561181434599156, + "grad_norm": 1.3548237085342407, + "learning_rate": 6.898708866888005e-05, + "loss": 0.6057673096656799, + "step": 6058 + }, + { + "epoch": 2.5569620253164556, + "grad_norm": 1.1422157287597656, + "learning_rate": 6.89648707337676e-05, + "loss": 0.5493726134300232, + "step": 6060 + }, + { + "epoch": 2.557805907172996, + "grad_norm": 1.0179574489593506, + "learning_rate": 6.89426484238237e-05, + "loss": 0.5055251717567444, + "step": 6062 + }, + { + "epoch": 2.558649789029536, + "grad_norm": 1.2062081098556519, + "learning_rate": 6.89204217441746e-05, + "loss": 0.6099714040756226, + "step": 6064 + }, + { + "epoch": 2.559493670886076, + "grad_norm": 1.3043999671936035, + "learning_rate": 6.889819069994759e-05, + "loss": 0.6432347893714905, + "step": 6066 + }, + { + "epoch": 2.5603375527426158, + "grad_norm": 1.241347074508667, + "learning_rate": 6.887595529627093e-05, + "loss": 0.6052974462509155, + "step": 6068 + }, + { + "epoch": 2.561181434599156, + "grad_norm": 1.2502845525741577, + "learning_rate": 6.88537155382739e-05, + "loss": 0.6239711046218872, + "step": 6070 + }, + { + "epoch": 2.562025316455696, + "grad_norm": 1.0815852880477905, + "learning_rate": 6.883147143108679e-05, + "loss": 0.5462124347686768, + "step": 6072 + }, + { + "epoch": 2.562869198312236, + "grad_norm": 1.1990602016448975, + "learning_rate": 6.880922297984087e-05, + "loss": 0.5727240443229675, + "step": 6074 + }, + { + "epoch": 2.5637130801687764, + "grad_norm": 1.016781210899353, + "learning_rate": 6.878697018966846e-05, + "loss": 0.5160089731216431, + "step": 6076 + }, + { + "epoch": 2.5645569620253164, + "grad_norm": 1.1946886777877808, + "learning_rate": 6.876471306570286e-05, + "loss": 0.6344075798988342, + "step": 6078 + }, + { + "epoch": 2.5654008438818563, + "grad_norm": 1.1460139751434326, + "learning_rate": 6.87424516130783e-05, + "loss": 0.6142247319221497, + "step": 6080 + }, + { + "epoch": 2.5662447257383967, + "grad_norm": 1.3636937141418457, + "learning_rate": 6.872018583693013e-05, + "loss": 0.6330769658088684, + "step": 6082 + }, + { + "epoch": 2.5670886075949366, + "grad_norm": 1.3545513153076172, + "learning_rate": 6.869791574239463e-05, + "loss": 0.6386255621910095, + "step": 6084 + }, + { + "epoch": 2.5679324894514766, + "grad_norm": 1.1196715831756592, + "learning_rate": 6.867564133460904e-05, + "loss": 0.5527385473251343, + "step": 6086 + }, + { + "epoch": 2.568776371308017, + "grad_norm": 1.0583977699279785, + "learning_rate": 6.865336261871168e-05, + "loss": 0.5689145922660828, + "step": 6088 + }, + { + "epoch": 2.569620253164557, + "grad_norm": 1.2963348627090454, + "learning_rate": 6.86310795998418e-05, + "loss": 0.5756540298461914, + "step": 6090 + }, + { + "epoch": 2.570464135021097, + "grad_norm": 1.122214436531067, + "learning_rate": 6.860879228313968e-05, + "loss": 0.6062834858894348, + "step": 6092 + }, + { + "epoch": 2.571308016877637, + "grad_norm": 1.1313230991363525, + "learning_rate": 6.858650067374657e-05, + "loss": 0.5526617169380188, + "step": 6094 + }, + { + "epoch": 2.572151898734177, + "grad_norm": 1.6992650032043457, + "learning_rate": 6.856420477680471e-05, + "loss": 0.5911332964897156, + "step": 6096 + }, + { + "epoch": 2.572995780590717, + "grad_norm": 1.2622860670089722, + "learning_rate": 6.854190459745735e-05, + "loss": 0.5730270743370056, + "step": 6098 + }, + { + "epoch": 2.5738396624472575, + "grad_norm": 1.1420512199401855, + "learning_rate": 6.851960014084868e-05, + "loss": 0.597838282585144, + "step": 6100 + }, + { + "epoch": 2.5738396624472575, + "eval_loss": 0.6812278628349304, + "eval_runtime": 513.4749, + "eval_samples_per_second": 4.103, + "eval_steps_per_second": 4.103, + "step": 6100 + }, + { + "epoch": 2.5746835443037974, + "grad_norm": 1.129335641860962, + "learning_rate": 6.849729141212396e-05, + "loss": 0.6048991084098816, + "step": 6102 + }, + { + "epoch": 2.5755274261603374, + "grad_norm": 1.161284327507019, + "learning_rate": 6.847497841642935e-05, + "loss": 0.6359057426452637, + "step": 6104 + }, + { + "epoch": 2.5763713080168777, + "grad_norm": 1.285344123840332, + "learning_rate": 6.845266115891203e-05, + "loss": 0.5858902335166931, + "step": 6106 + }, + { + "epoch": 2.5772151898734177, + "grad_norm": 1.085143804550171, + "learning_rate": 6.843033964472018e-05, + "loss": 0.5742247700691223, + "step": 6108 + }, + { + "epoch": 2.5780590717299576, + "grad_norm": 1.1920831203460693, + "learning_rate": 6.840801387900291e-05, + "loss": 0.6738532185554504, + "step": 6110 + }, + { + "epoch": 2.578902953586498, + "grad_norm": 1.2750232219696045, + "learning_rate": 6.838568386691042e-05, + "loss": 0.6046389937400818, + "step": 6112 + }, + { + "epoch": 2.579746835443038, + "grad_norm": 1.1027764081954956, + "learning_rate": 6.836334961359373e-05, + "loss": 0.6231611967086792, + "step": 6114 + }, + { + "epoch": 2.580590717299578, + "grad_norm": 1.2996546030044556, + "learning_rate": 6.834101112420497e-05, + "loss": 0.5848191380500793, + "step": 6116 + }, + { + "epoch": 2.5814345991561183, + "grad_norm": 1.2683454751968384, + "learning_rate": 6.831866840389719e-05, + "loss": 0.6160622835159302, + "step": 6118 + }, + { + "epoch": 2.5822784810126582, + "grad_norm": 1.049797534942627, + "learning_rate": 6.829632145782441e-05, + "loss": 0.5220097899436951, + "step": 6120 + }, + { + "epoch": 2.583122362869198, + "grad_norm": 1.1798468828201294, + "learning_rate": 6.827397029114168e-05, + "loss": 0.5709835290908813, + "step": 6122 + }, + { + "epoch": 2.5839662447257385, + "grad_norm": 1.0136369466781616, + "learning_rate": 6.825161490900495e-05, + "loss": 0.5086703300476074, + "step": 6124 + }, + { + "epoch": 2.5848101265822785, + "grad_norm": 1.147735595703125, + "learning_rate": 6.822925531657119e-05, + "loss": 0.5904423594474792, + "step": 6126 + }, + { + "epoch": 2.5856540084388184, + "grad_norm": 0.9979357123374939, + "learning_rate": 6.820689151899833e-05, + "loss": 0.5002011060714722, + "step": 6128 + }, + { + "epoch": 2.586497890295359, + "grad_norm": 1.4129728078842163, + "learning_rate": 6.818452352144527e-05, + "loss": 0.5694814920425415, + "step": 6130 + }, + { + "epoch": 2.5873417721518988, + "grad_norm": 1.1388975381851196, + "learning_rate": 6.816215132907186e-05, + "loss": 0.5448270440101624, + "step": 6132 + }, + { + "epoch": 2.5881856540084387, + "grad_norm": 1.268865942955017, + "learning_rate": 6.813977494703896e-05, + "loss": 0.6184739470481873, + "step": 6134 + }, + { + "epoch": 2.589029535864979, + "grad_norm": 1.2403846979141235, + "learning_rate": 6.811739438050835e-05, + "loss": 0.6493034958839417, + "step": 6136 + }, + { + "epoch": 2.589873417721519, + "grad_norm": 1.108298659324646, + "learning_rate": 6.809500963464282e-05, + "loss": 0.6168854236602783, + "step": 6138 + }, + { + "epoch": 2.590717299578059, + "grad_norm": 1.106427788734436, + "learning_rate": 6.807262071460609e-05, + "loss": 0.5734958052635193, + "step": 6140 + }, + { + "epoch": 2.5915611814345993, + "grad_norm": 1.147791862487793, + "learning_rate": 6.805022762556286e-05, + "loss": 0.5422238111495972, + "step": 6142 + }, + { + "epoch": 2.5924050632911393, + "grad_norm": 1.214465856552124, + "learning_rate": 6.802783037267874e-05, + "loss": 0.6511701345443726, + "step": 6144 + }, + { + "epoch": 2.5932489451476792, + "grad_norm": 1.087735891342163, + "learning_rate": 6.800542896112043e-05, + "loss": 0.5978493094444275, + "step": 6146 + }, + { + "epoch": 2.5940928270042196, + "grad_norm": 1.0772241353988647, + "learning_rate": 6.798302339605544e-05, + "loss": 0.5656765699386597, + "step": 6148 + }, + { + "epoch": 2.5949367088607596, + "grad_norm": 1.1666499376296997, + "learning_rate": 6.796061368265231e-05, + "loss": 0.6147777438163757, + "step": 6150 + }, + { + "epoch": 2.5957805907172995, + "grad_norm": 0.9949467182159424, + "learning_rate": 6.793819982608057e-05, + "loss": 0.502659022808075, + "step": 6152 + }, + { + "epoch": 2.59662447257384, + "grad_norm": 1.311484456062317, + "learning_rate": 6.791578183151061e-05, + "loss": 0.6019812226295471, + "step": 6154 + }, + { + "epoch": 2.59746835443038, + "grad_norm": 0.9594855904579163, + "learning_rate": 6.789335970411387e-05, + "loss": 0.625690221786499, + "step": 6156 + }, + { + "epoch": 2.5983122362869198, + "grad_norm": 1.2252063751220703, + "learning_rate": 6.78709334490627e-05, + "loss": 0.628356397151947, + "step": 6158 + }, + { + "epoch": 2.59915611814346, + "grad_norm": 1.089603304862976, + "learning_rate": 6.784850307153043e-05, + "loss": 0.5447192192077637, + "step": 6160 + }, + { + "epoch": 2.6, + "grad_norm": 1.1035163402557373, + "learning_rate": 6.782606857669125e-05, + "loss": 0.5400487184524536, + "step": 6162 + }, + { + "epoch": 2.60084388185654, + "grad_norm": 1.2329976558685303, + "learning_rate": 6.780362996972042e-05, + "loss": 0.5795643329620361, + "step": 6164 + }, + { + "epoch": 2.6016877637130804, + "grad_norm": 1.2984000444412231, + "learning_rate": 6.778118725579408e-05, + "loss": 0.5664985775947571, + "step": 6166 + }, + { + "epoch": 2.6025316455696204, + "grad_norm": 1.3563600778579712, + "learning_rate": 6.775874044008933e-05, + "loss": 0.5406283140182495, + "step": 6168 + }, + { + "epoch": 2.6033755274261603, + "grad_norm": 1.1897385120391846, + "learning_rate": 6.773628952778421e-05, + "loss": 0.5362374782562256, + "step": 6170 + }, + { + "epoch": 2.6042194092827007, + "grad_norm": 1.1492685079574585, + "learning_rate": 6.771383452405773e-05, + "loss": 0.5942689180374146, + "step": 6172 + }, + { + "epoch": 2.6050632911392406, + "grad_norm": 1.2306408882141113, + "learning_rate": 6.769137543408985e-05, + "loss": 0.6144227981567383, + "step": 6174 + }, + { + "epoch": 2.6059071729957806, + "grad_norm": 1.1260589361190796, + "learning_rate": 6.766891226306143e-05, + "loss": 0.5147640705108643, + "step": 6176 + }, + { + "epoch": 2.606751054852321, + "grad_norm": 1.214007019996643, + "learning_rate": 6.764644501615427e-05, + "loss": 0.6822091341018677, + "step": 6178 + }, + { + "epoch": 2.607594936708861, + "grad_norm": 1.2251341342926025, + "learning_rate": 6.762397369855116e-05, + "loss": 0.5330857038497925, + "step": 6180 + }, + { + "epoch": 2.608438818565401, + "grad_norm": 1.3556525707244873, + "learning_rate": 6.760149831543578e-05, + "loss": 0.58979332447052, + "step": 6182 + }, + { + "epoch": 2.6092827004219408, + "grad_norm": 1.286598563194275, + "learning_rate": 6.757901887199278e-05, + "loss": 0.5667334198951721, + "step": 6184 + }, + { + "epoch": 2.610126582278481, + "grad_norm": 1.2515888214111328, + "learning_rate": 6.755653537340776e-05, + "loss": 0.6028750538825989, + "step": 6186 + }, + { + "epoch": 2.610970464135021, + "grad_norm": 1.1090617179870605, + "learning_rate": 6.753404782486719e-05, + "loss": 0.604102611541748, + "step": 6188 + }, + { + "epoch": 2.611814345991561, + "grad_norm": 1.1782273054122925, + "learning_rate": 6.751155623155853e-05, + "loss": 0.5486276745796204, + "step": 6190 + }, + { + "epoch": 2.6126582278481014, + "grad_norm": 1.5475431680679321, + "learning_rate": 6.748906059867018e-05, + "loss": 0.630682110786438, + "step": 6192 + }, + { + "epoch": 2.6135021097046414, + "grad_norm": 1.237891435623169, + "learning_rate": 6.746656093139143e-05, + "loss": 0.571597695350647, + "step": 6194 + }, + { + "epoch": 2.6143459915611813, + "grad_norm": 1.2367130517959595, + "learning_rate": 6.744405723491253e-05, + "loss": 0.6020040512084961, + "step": 6196 + }, + { + "epoch": 2.6151898734177212, + "grad_norm": 1.0747612714767456, + "learning_rate": 6.742154951442464e-05, + "loss": 0.5520704984664917, + "step": 6198 + }, + { + "epoch": 2.6160337552742616, + "grad_norm": 1.3944035768508911, + "learning_rate": 6.739903777511985e-05, + "loss": 0.7312755584716797, + "step": 6200 + }, + { + "epoch": 2.6160337552742616, + "eval_loss": 0.6795271039009094, + "eval_runtime": 513.2393, + "eval_samples_per_second": 4.105, + "eval_steps_per_second": 4.105, + "step": 6200 + }, + { + "epoch": 2.6168776371308016, + "grad_norm": 1.3716613054275513, + "learning_rate": 6.737652202219121e-05, + "loss": 0.617123007774353, + "step": 6202 + }, + { + "epoch": 2.6177215189873415, + "grad_norm": 1.1962300539016724, + "learning_rate": 6.735400226083267e-05, + "loss": 0.5791950225830078, + "step": 6204 + }, + { + "epoch": 2.618565400843882, + "grad_norm": 1.2570394277572632, + "learning_rate": 6.733147849623909e-05, + "loss": 0.5941018462181091, + "step": 6206 + }, + { + "epoch": 2.619409282700422, + "grad_norm": 1.2903523445129395, + "learning_rate": 6.730895073360628e-05, + "loss": 0.5417253971099854, + "step": 6208 + }, + { + "epoch": 2.620253164556962, + "grad_norm": 1.0618562698364258, + "learning_rate": 6.728641897813096e-05, + "loss": 0.536359965801239, + "step": 6210 + }, + { + "epoch": 2.621097046413502, + "grad_norm": 1.307300090789795, + "learning_rate": 6.726388323501077e-05, + "loss": 0.6409479975700378, + "step": 6212 + }, + { + "epoch": 2.621940928270042, + "grad_norm": 1.3672584295272827, + "learning_rate": 6.72413435094443e-05, + "loss": 0.66277676820755, + "step": 6214 + }, + { + "epoch": 2.622784810126582, + "grad_norm": 1.2156232595443726, + "learning_rate": 6.721879980663098e-05, + "loss": 0.6193054914474487, + "step": 6216 + }, + { + "epoch": 2.6236286919831224, + "grad_norm": 1.1575636863708496, + "learning_rate": 6.719625213177124e-05, + "loss": 0.5773701667785645, + "step": 6218 + }, + { + "epoch": 2.6244725738396624, + "grad_norm": 1.2327474355697632, + "learning_rate": 6.71737004900664e-05, + "loss": 0.6913977265357971, + "step": 6220 + }, + { + "epoch": 2.6253164556962023, + "grad_norm": 1.1316778659820557, + "learning_rate": 6.715114488671869e-05, + "loss": 0.5773524045944214, + "step": 6222 + }, + { + "epoch": 2.6261603375527427, + "grad_norm": 1.1508816480636597, + "learning_rate": 6.712858532693125e-05, + "loss": 0.5554601550102234, + "step": 6224 + }, + { + "epoch": 2.6270042194092826, + "grad_norm": 1.2404967546463013, + "learning_rate": 6.710602181590812e-05, + "loss": 0.6090670824050903, + "step": 6226 + }, + { + "epoch": 2.6278481012658226, + "grad_norm": 1.0721718072891235, + "learning_rate": 6.70834543588543e-05, + "loss": 0.5546537637710571, + "step": 6228 + }, + { + "epoch": 2.628691983122363, + "grad_norm": 1.2788114547729492, + "learning_rate": 6.706088296097564e-05, + "loss": 0.5939876437187195, + "step": 6230 + }, + { + "epoch": 2.629535864978903, + "grad_norm": 1.1952526569366455, + "learning_rate": 6.703830762747896e-05, + "loss": 0.5291836857795715, + "step": 6232 + }, + { + "epoch": 2.630379746835443, + "grad_norm": 1.0261807441711426, + "learning_rate": 6.701572836357191e-05, + "loss": 0.518436074256897, + "step": 6234 + }, + { + "epoch": 2.6312236286919832, + "grad_norm": 1.1804791688919067, + "learning_rate": 6.699314517446316e-05, + "loss": 0.5830684900283813, + "step": 6236 + }, + { + "epoch": 2.632067510548523, + "grad_norm": 1.2079823017120361, + "learning_rate": 6.697055806536214e-05, + "loss": 0.5899971127510071, + "step": 6238 + }, + { + "epoch": 2.632911392405063, + "grad_norm": 1.1989154815673828, + "learning_rate": 6.694796704147932e-05, + "loss": 0.6533132791519165, + "step": 6240 + }, + { + "epoch": 2.6337552742616035, + "grad_norm": 1.0621024370193481, + "learning_rate": 6.692537210802598e-05, + "loss": 0.5341002345085144, + "step": 6242 + }, + { + "epoch": 2.6345991561181434, + "grad_norm": 1.2911880016326904, + "learning_rate": 6.690277327021436e-05, + "loss": 0.6795719861984253, + "step": 6244 + }, + { + "epoch": 2.6354430379746834, + "grad_norm": 1.3586145639419556, + "learning_rate": 6.688017053325757e-05, + "loss": 0.5390555262565613, + "step": 6246 + }, + { + "epoch": 2.6362869198312238, + "grad_norm": 1.31569242477417, + "learning_rate": 6.685756390236964e-05, + "loss": 0.5935586094856262, + "step": 6248 + }, + { + "epoch": 2.6371308016877637, + "grad_norm": 1.0801384449005127, + "learning_rate": 6.683495338276547e-05, + "loss": 0.5845919847488403, + "step": 6250 + }, + { + "epoch": 2.6379746835443036, + "grad_norm": 1.179715633392334, + "learning_rate": 6.681233897966087e-05, + "loss": 0.6017906665802002, + "step": 6252 + }, + { + "epoch": 2.638818565400844, + "grad_norm": 1.1927930116653442, + "learning_rate": 6.678972069827255e-05, + "loss": 0.6637946367263794, + "step": 6254 + }, + { + "epoch": 2.639662447257384, + "grad_norm": 1.2167247533798218, + "learning_rate": 6.676709854381812e-05, + "loss": 0.5572535991668701, + "step": 6256 + }, + { + "epoch": 2.640506329113924, + "grad_norm": 1.2026311159133911, + "learning_rate": 6.674447252151608e-05, + "loss": 0.5426514148712158, + "step": 6258 + }, + { + "epoch": 2.6413502109704643, + "grad_norm": 1.101891279220581, + "learning_rate": 6.672184263658579e-05, + "loss": 0.5123113989830017, + "step": 6260 + }, + { + "epoch": 2.6421940928270042, + "grad_norm": 1.3467986583709717, + "learning_rate": 6.669920889424758e-05, + "loss": 0.6018276214599609, + "step": 6262 + }, + { + "epoch": 2.643037974683544, + "grad_norm": 1.2477779388427734, + "learning_rate": 6.667657129972257e-05, + "loss": 0.5618380308151245, + "step": 6264 + }, + { + "epoch": 2.6438818565400846, + "grad_norm": 1.1284273862838745, + "learning_rate": 6.665392985823287e-05, + "loss": 0.5541924834251404, + "step": 6266 + }, + { + "epoch": 2.6447257383966245, + "grad_norm": 1.2376370429992676, + "learning_rate": 6.663128457500137e-05, + "loss": 0.5534335970878601, + "step": 6268 + }, + { + "epoch": 2.6455696202531644, + "grad_norm": 1.3205965757369995, + "learning_rate": 6.660863545525196e-05, + "loss": 0.6160520315170288, + "step": 6270 + }, + { + "epoch": 2.646413502109705, + "grad_norm": 1.175926685333252, + "learning_rate": 6.65859825042093e-05, + "loss": 0.6035991311073303, + "step": 6272 + }, + { + "epoch": 2.6472573839662448, + "grad_norm": 1.2805176973342896, + "learning_rate": 6.656332572709901e-05, + "loss": 0.6101992130279541, + "step": 6274 + }, + { + "epoch": 2.6481012658227847, + "grad_norm": 1.2493922710418701, + "learning_rate": 6.65406651291476e-05, + "loss": 0.5665684342384338, + "step": 6276 + }, + { + "epoch": 2.648945147679325, + "grad_norm": 1.3103299140930176, + "learning_rate": 6.65180007155824e-05, + "loss": 0.682868242263794, + "step": 6278 + }, + { + "epoch": 2.649789029535865, + "grad_norm": 1.3098952770233154, + "learning_rate": 6.649533249163167e-05, + "loss": 0.6398087739944458, + "step": 6280 + }, + { + "epoch": 2.650632911392405, + "grad_norm": 1.230396032333374, + "learning_rate": 6.647266046252454e-05, + "loss": 0.5410205721855164, + "step": 6282 + }, + { + "epoch": 2.6514767932489454, + "grad_norm": 1.1755880117416382, + "learning_rate": 6.6449984633491e-05, + "loss": 0.6019781231880188, + "step": 6284 + }, + { + "epoch": 2.6523206751054853, + "grad_norm": 1.1013081073760986, + "learning_rate": 6.642730500976193e-05, + "loss": 0.5327204465866089, + "step": 6286 + }, + { + "epoch": 2.6531645569620252, + "grad_norm": 1.1285136938095093, + "learning_rate": 6.640462159656908e-05, + "loss": 0.6458070278167725, + "step": 6288 + }, + { + "epoch": 2.6540084388185656, + "grad_norm": 1.5320124626159668, + "learning_rate": 6.638193439914512e-05, + "loss": 0.6038496494293213, + "step": 6290 + }, + { + "epoch": 2.6548523206751056, + "grad_norm": 1.0231032371520996, + "learning_rate": 6.635924342272349e-05, + "loss": 0.5353283286094666, + "step": 6292 + }, + { + "epoch": 2.6556962025316455, + "grad_norm": 1.1871505975723267, + "learning_rate": 6.633654867253858e-05, + "loss": 0.644368588924408, + "step": 6294 + }, + { + "epoch": 2.656540084388186, + "grad_norm": 1.0641425848007202, + "learning_rate": 6.631385015382565e-05, + "loss": 0.5251830220222473, + "step": 6296 + }, + { + "epoch": 2.657383966244726, + "grad_norm": 0.8980898261070251, + "learning_rate": 6.62911478718208e-05, + "loss": 0.527733564376831, + "step": 6298 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 1.1694822311401367, + "learning_rate": 6.626844183176102e-05, + "loss": 0.5868222117424011, + "step": 6300 + }, + { + "epoch": 2.6582278481012658, + "eval_loss": 0.6781066656112671, + "eval_runtime": 512.3669, + "eval_samples_per_second": 4.112, + "eval_steps_per_second": 4.112, + "step": 6300 + }, + { + "epoch": 2.659071729957806, + "grad_norm": 1.3010352849960327, + "learning_rate": 6.624573203888413e-05, + "loss": 0.5965607166290283, + "step": 6302 + }, + { + "epoch": 2.659915611814346, + "grad_norm": 1.074964165687561, + "learning_rate": 6.62230184984289e-05, + "loss": 0.5776658654212952, + "step": 6304 + }, + { + "epoch": 2.660759493670886, + "grad_norm": 1.0930451154708862, + "learning_rate": 6.620030121563484e-05, + "loss": 0.584223210811615, + "step": 6306 + }, + { + "epoch": 2.6616033755274264, + "grad_norm": 1.1418803930282593, + "learning_rate": 6.617758019574243e-05, + "loss": 0.534063994884491, + "step": 6308 + }, + { + "epoch": 2.6624472573839664, + "grad_norm": 1.1602790355682373, + "learning_rate": 6.615485544399298e-05, + "loss": 0.5719610452651978, + "step": 6310 + }, + { + "epoch": 2.6632911392405063, + "grad_norm": 1.0926544666290283, + "learning_rate": 6.613212696562863e-05, + "loss": 0.5489934682846069, + "step": 6312 + }, + { + "epoch": 2.6641350210970463, + "grad_norm": 1.2560242414474487, + "learning_rate": 6.610939476589239e-05, + "loss": 0.5568612217903137, + "step": 6314 + }, + { + "epoch": 2.6649789029535866, + "grad_norm": 1.110960602760315, + "learning_rate": 6.60866588500282e-05, + "loss": 0.6019266247749329, + "step": 6316 + }, + { + "epoch": 2.6658227848101266, + "grad_norm": 1.333012342453003, + "learning_rate": 6.606391922328074e-05, + "loss": 0.6083081364631653, + "step": 6318 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.1256170272827148, + "learning_rate": 6.604117589089564e-05, + "loss": 0.5586183071136475, + "step": 6320 + }, + { + "epoch": 2.667510548523207, + "grad_norm": 1.2877609729766846, + "learning_rate": 6.601842885811934e-05, + "loss": 0.5676470994949341, + "step": 6322 + }, + { + "epoch": 2.668354430379747, + "grad_norm": 1.305034875869751, + "learning_rate": 6.599567813019914e-05, + "loss": 0.6470263600349426, + "step": 6324 + }, + { + "epoch": 2.669198312236287, + "grad_norm": 1.1695195436477661, + "learning_rate": 6.597292371238318e-05, + "loss": 0.588540256023407, + "step": 6326 + }, + { + "epoch": 2.670042194092827, + "grad_norm": 1.084652304649353, + "learning_rate": 6.59501656099205e-05, + "loss": 0.602922260761261, + "step": 6328 + }, + { + "epoch": 2.670886075949367, + "grad_norm": 1.1664962768554688, + "learning_rate": 6.592740382806094e-05, + "loss": 0.5613425970077515, + "step": 6330 + }, + { + "epoch": 2.671729957805907, + "grad_norm": 1.2208726406097412, + "learning_rate": 6.590463837205522e-05, + "loss": 0.5850927829742432, + "step": 6332 + }, + { + "epoch": 2.672573839662447, + "grad_norm": 1.0662479400634766, + "learning_rate": 6.588186924715488e-05, + "loss": 0.503675639629364, + "step": 6334 + }, + { + "epoch": 2.6734177215189874, + "grad_norm": 1.5318000316619873, + "learning_rate": 6.58590964586123e-05, + "loss": 0.6245100498199463, + "step": 6336 + }, + { + "epoch": 2.6742616033755273, + "grad_norm": 1.402784824371338, + "learning_rate": 6.583632001168077e-05, + "loss": 0.6556243896484375, + "step": 6338 + }, + { + "epoch": 2.6751054852320673, + "grad_norm": 1.2293213605880737, + "learning_rate": 6.581353991161435e-05, + "loss": 0.6398119926452637, + "step": 6340 + }, + { + "epoch": 2.6759493670886076, + "grad_norm": 1.2687599658966064, + "learning_rate": 6.579075616366797e-05, + "loss": 0.5792493224143982, + "step": 6342 + }, + { + "epoch": 2.6767932489451476, + "grad_norm": 1.2112480401992798, + "learning_rate": 6.576796877309741e-05, + "loss": 0.6669304966926575, + "step": 6344 + }, + { + "epoch": 2.6776371308016875, + "grad_norm": 1.3074487447738647, + "learning_rate": 6.574517774515929e-05, + "loss": 0.6012452840805054, + "step": 6346 + }, + { + "epoch": 2.678481012658228, + "grad_norm": 1.3157081604003906, + "learning_rate": 6.572238308511106e-05, + "loss": 0.6556297540664673, + "step": 6348 + }, + { + "epoch": 2.679324894514768, + "grad_norm": 1.0735292434692383, + "learning_rate": 6.569958479821099e-05, + "loss": 0.5607976317405701, + "step": 6350 + }, + { + "epoch": 2.680168776371308, + "grad_norm": 1.1896809339523315, + "learning_rate": 6.567678288971825e-05, + "loss": 0.6040812730789185, + "step": 6352 + }, + { + "epoch": 2.681012658227848, + "grad_norm": 1.1350760459899902, + "learning_rate": 6.565397736489274e-05, + "loss": 0.5807676911354065, + "step": 6354 + }, + { + "epoch": 2.681856540084388, + "grad_norm": 1.3865782022476196, + "learning_rate": 6.563116822899532e-05, + "loss": 0.5877989530563354, + "step": 6356 + }, + { + "epoch": 2.682700421940928, + "grad_norm": 1.218682050704956, + "learning_rate": 6.560835548728758e-05, + "loss": 0.614531397819519, + "step": 6358 + }, + { + "epoch": 2.6835443037974684, + "grad_norm": 1.06162691116333, + "learning_rate": 6.5585539145032e-05, + "loss": 0.5880973935127258, + "step": 6360 + }, + { + "epoch": 2.6843881856540084, + "grad_norm": 1.264328956604004, + "learning_rate": 6.556271920749187e-05, + "loss": 0.5795428156852722, + "step": 6362 + }, + { + "epoch": 2.6852320675105483, + "grad_norm": 1.335652470588684, + "learning_rate": 6.553989567993129e-05, + "loss": 0.5927176475524902, + "step": 6364 + }, + { + "epoch": 2.6860759493670887, + "grad_norm": 1.1110745668411255, + "learning_rate": 6.551706856761524e-05, + "loss": 0.5814473628997803, + "step": 6366 + }, + { + "epoch": 2.6869198312236287, + "grad_norm": 1.1731220483779907, + "learning_rate": 6.549423787580947e-05, + "loss": 0.557738184928894, + "step": 6368 + }, + { + "epoch": 2.6877637130801686, + "grad_norm": 1.2679874897003174, + "learning_rate": 6.54714036097806e-05, + "loss": 0.5947291254997253, + "step": 6370 + }, + { + "epoch": 2.688607594936709, + "grad_norm": 1.112322211265564, + "learning_rate": 6.544856577479606e-05, + "loss": 0.5769563317298889, + "step": 6372 + }, + { + "epoch": 2.689451476793249, + "grad_norm": 1.3385759592056274, + "learning_rate": 6.542572437612408e-05, + "loss": 0.6077675223350525, + "step": 6374 + }, + { + "epoch": 2.690295358649789, + "grad_norm": 1.0953450202941895, + "learning_rate": 6.540287941903375e-05, + "loss": 0.5600538849830627, + "step": 6376 + }, + { + "epoch": 2.6911392405063292, + "grad_norm": 1.2455042600631714, + "learning_rate": 6.538003090879495e-05, + "loss": 0.5828459858894348, + "step": 6378 + }, + { + "epoch": 2.691983122362869, + "grad_norm": 1.2563562393188477, + "learning_rate": 6.53571788506784e-05, + "loss": 0.5844002366065979, + "step": 6380 + }, + { + "epoch": 2.692827004219409, + "grad_norm": 1.3466061353683472, + "learning_rate": 6.533432324995563e-05, + "loss": 0.6632003784179688, + "step": 6382 + }, + { + "epoch": 2.6936708860759495, + "grad_norm": 1.2467784881591797, + "learning_rate": 6.531146411189899e-05, + "loss": 0.5532103180885315, + "step": 6384 + }, + { + "epoch": 2.6945147679324895, + "grad_norm": 1.344250202178955, + "learning_rate": 6.528860144178163e-05, + "loss": 0.5722881555557251, + "step": 6386 + }, + { + "epoch": 2.6953586497890294, + "grad_norm": 1.3688865900039673, + "learning_rate": 6.526573524487756e-05, + "loss": 0.6424282789230347, + "step": 6388 + }, + { + "epoch": 2.6962025316455698, + "grad_norm": 1.4252339601516724, + "learning_rate": 6.524286552646153e-05, + "loss": 0.5986620783805847, + "step": 6390 + }, + { + "epoch": 2.6970464135021097, + "grad_norm": 1.4102380275726318, + "learning_rate": 6.52199922918092e-05, + "loss": 0.6466318368911743, + "step": 6392 + }, + { + "epoch": 2.6978902953586497, + "grad_norm": 1.184442400932312, + "learning_rate": 6.519711554619692e-05, + "loss": 0.6259894371032715, + "step": 6394 + }, + { + "epoch": 2.69873417721519, + "grad_norm": 1.2751896381378174, + "learning_rate": 6.517423529490198e-05, + "loss": 0.5682622194290161, + "step": 6396 + }, + { + "epoch": 2.69957805907173, + "grad_norm": 1.3333114385604858, + "learning_rate": 6.515135154320236e-05, + "loss": 0.573390007019043, + "step": 6398 + }, + { + "epoch": 2.70042194092827, + "grad_norm": 1.2505477666854858, + "learning_rate": 6.512846429637693e-05, + "loss": 0.5839408040046692, + "step": 6400 + }, + { + "epoch": 2.70042194092827, + "eval_loss": 0.6764505505561829, + "eval_runtime": 512.7682, + "eval_samples_per_second": 4.109, + "eval_steps_per_second": 4.109, + "step": 6400 + }, + { + "epoch": 2.7012658227848103, + "grad_norm": 1.2822065353393555, + "learning_rate": 6.510557355970534e-05, + "loss": 0.6000106334686279, + "step": 6402 + }, + { + "epoch": 2.7021097046413503, + "grad_norm": 1.2144463062286377, + "learning_rate": 6.508267933846803e-05, + "loss": 0.5796633362770081, + "step": 6404 + }, + { + "epoch": 2.70295358649789, + "grad_norm": 1.189985990524292, + "learning_rate": 6.505978163794628e-05, + "loss": 0.5976626873016357, + "step": 6406 + }, + { + "epoch": 2.7037974683544306, + "grad_norm": 1.0484727621078491, + "learning_rate": 6.503688046342212e-05, + "loss": 0.5054599642753601, + "step": 6408 + }, + { + "epoch": 2.7046413502109705, + "grad_norm": 1.4333025217056274, + "learning_rate": 6.501397582017844e-05, + "loss": 0.6539149284362793, + "step": 6410 + }, + { + "epoch": 2.7054852320675105, + "grad_norm": 1.1808522939682007, + "learning_rate": 6.499106771349887e-05, + "loss": 0.5220640301704407, + "step": 6412 + }, + { + "epoch": 2.706329113924051, + "grad_norm": 2.8626298904418945, + "learning_rate": 6.496815614866791e-05, + "loss": 0.6019118428230286, + "step": 6414 + }, + { + "epoch": 2.707172995780591, + "grad_norm": 1.1092768907546997, + "learning_rate": 6.494524113097078e-05, + "loss": 0.5754269361495972, + "step": 6416 + }, + { + "epoch": 2.7080168776371307, + "grad_norm": 1.2416579723358154, + "learning_rate": 6.492232266569353e-05, + "loss": 0.5548025369644165, + "step": 6418 + }, + { + "epoch": 2.708860759493671, + "grad_norm": 1.012360692024231, + "learning_rate": 6.489940075812306e-05, + "loss": 0.5706405639648438, + "step": 6420 + }, + { + "epoch": 2.709704641350211, + "grad_norm": 1.376641869544983, + "learning_rate": 6.487647541354698e-05, + "loss": 0.5862169861793518, + "step": 6422 + }, + { + "epoch": 2.710548523206751, + "grad_norm": 1.2425684928894043, + "learning_rate": 6.485354663725374e-05, + "loss": 0.5928428769111633, + "step": 6424 + }, + { + "epoch": 2.7113924050632914, + "grad_norm": 1.0926302671432495, + "learning_rate": 6.483061443453254e-05, + "loss": 0.5903078317642212, + "step": 6426 + }, + { + "epoch": 2.7122362869198313, + "grad_norm": 1.3698115348815918, + "learning_rate": 6.480767881067342e-05, + "loss": 0.5848883986473083, + "step": 6428 + }, + { + "epoch": 2.7130801687763713, + "grad_norm": 1.2949504852294922, + "learning_rate": 6.478473977096718e-05, + "loss": 0.5285207629203796, + "step": 6430 + }, + { + "epoch": 2.7139240506329116, + "grad_norm": 1.3662208318710327, + "learning_rate": 6.476179732070543e-05, + "loss": 0.5965171456336975, + "step": 6432 + }, + { + "epoch": 2.7147679324894516, + "grad_norm": 1.3127343654632568, + "learning_rate": 6.473885146518055e-05, + "loss": 0.6549378037452698, + "step": 6434 + }, + { + "epoch": 2.7156118143459915, + "grad_norm": 1.199431300163269, + "learning_rate": 6.471590220968568e-05, + "loss": 0.574461042881012, + "step": 6436 + }, + { + "epoch": 2.716455696202532, + "grad_norm": 1.1624091863632202, + "learning_rate": 6.469294955951481e-05, + "loss": 0.6142178177833557, + "step": 6438 + }, + { + "epoch": 2.717299578059072, + "grad_norm": 1.2685147523880005, + "learning_rate": 6.466999351996266e-05, + "loss": 0.5775829553604126, + "step": 6440 + }, + { + "epoch": 2.718143459915612, + "grad_norm": 1.0987834930419922, + "learning_rate": 6.464703409632476e-05, + "loss": 0.5400159955024719, + "step": 6442 + }, + { + "epoch": 2.7189873417721517, + "grad_norm": 1.2638986110687256, + "learning_rate": 6.462407129389736e-05, + "loss": 0.558712899684906, + "step": 6444 + }, + { + "epoch": 2.719831223628692, + "grad_norm": 1.174168586730957, + "learning_rate": 6.46011051179776e-05, + "loss": 0.5465238094329834, + "step": 6446 + }, + { + "epoch": 2.720675105485232, + "grad_norm": 1.2185649871826172, + "learning_rate": 6.457813557386331e-05, + "loss": 0.629173219203949, + "step": 6448 + }, + { + "epoch": 2.721518987341772, + "grad_norm": 1.1563167572021484, + "learning_rate": 6.455516266685311e-05, + "loss": 0.5557543039321899, + "step": 6450 + }, + { + "epoch": 2.7223628691983124, + "grad_norm": 1.2934051752090454, + "learning_rate": 6.453218640224642e-05, + "loss": 0.6350696682929993, + "step": 6452 + }, + { + "epoch": 2.7232067510548523, + "grad_norm": 1.045218825340271, + "learning_rate": 6.450920678534342e-05, + "loss": 0.544219434261322, + "step": 6454 + }, + { + "epoch": 2.7240506329113923, + "grad_norm": 1.3102771043777466, + "learning_rate": 6.44862238214451e-05, + "loss": 0.6312481760978699, + "step": 6456 + }, + { + "epoch": 2.7248945147679327, + "grad_norm": 1.3338704109191895, + "learning_rate": 6.446323751585312e-05, + "loss": 0.5772860050201416, + "step": 6458 + }, + { + "epoch": 2.7257383966244726, + "grad_norm": 1.1826046705245972, + "learning_rate": 6.444024787387003e-05, + "loss": 0.5450227856636047, + "step": 6460 + }, + { + "epoch": 2.7265822784810125, + "grad_norm": 1.2449530363082886, + "learning_rate": 6.441725490079908e-05, + "loss": 0.5775642395019531, + "step": 6462 + }, + { + "epoch": 2.7274261603375525, + "grad_norm": 1.1204898357391357, + "learning_rate": 6.439425860194432e-05, + "loss": 0.5795316100120544, + "step": 6464 + }, + { + "epoch": 2.728270042194093, + "grad_norm": 1.179542064666748, + "learning_rate": 6.437125898261056e-05, + "loss": 0.6187583804130554, + "step": 6466 + }, + { + "epoch": 2.729113924050633, + "grad_norm": 1.2231724262237549, + "learning_rate": 6.434825604810333e-05, + "loss": 0.581790566444397, + "step": 6468 + }, + { + "epoch": 2.7299578059071727, + "grad_norm": 1.178859829902649, + "learning_rate": 6.432524980372902e-05, + "loss": 0.5470858812332153, + "step": 6470 + }, + { + "epoch": 2.730801687763713, + "grad_norm": 1.2092641592025757, + "learning_rate": 6.430224025479469e-05, + "loss": 0.591381311416626, + "step": 6472 + }, + { + "epoch": 2.731645569620253, + "grad_norm": 1.395704746246338, + "learning_rate": 6.42792274066082e-05, + "loss": 0.6809561252593994, + "step": 6474 + }, + { + "epoch": 2.732489451476793, + "grad_norm": 1.1937509775161743, + "learning_rate": 6.42562112644782e-05, + "loss": 0.5667102932929993, + "step": 6476 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 1.2181694507598877, + "learning_rate": 6.423319183371405e-05, + "loss": 0.5832397937774658, + "step": 6478 + }, + { + "epoch": 2.7341772151898733, + "grad_norm": 0.9961143732070923, + "learning_rate": 6.42101691196259e-05, + "loss": 0.5432526469230652, + "step": 6480 + }, + { + "epoch": 2.7350210970464133, + "grad_norm": 1.2029842138290405, + "learning_rate": 6.418714312752466e-05, + "loss": 0.5740163326263428, + "step": 6482 + }, + { + "epoch": 2.7358649789029537, + "grad_norm": 1.4317080974578857, + "learning_rate": 6.416411386272196e-05, + "loss": 0.6384599804878235, + "step": 6484 + }, + { + "epoch": 2.7367088607594936, + "grad_norm": 1.2837908267974854, + "learning_rate": 6.414108133053022e-05, + "loss": 0.6619245409965515, + "step": 6486 + }, + { + "epoch": 2.7375527426160335, + "grad_norm": 1.1140583753585815, + "learning_rate": 6.41180455362626e-05, + "loss": 0.5453745126724243, + "step": 6488 + }, + { + "epoch": 2.738396624472574, + "grad_norm": 1.1226048469543457, + "learning_rate": 6.409500648523302e-05, + "loss": 0.6225460171699524, + "step": 6490 + }, + { + "epoch": 2.739240506329114, + "grad_norm": 1.2367178201675415, + "learning_rate": 6.407196418275613e-05, + "loss": 0.5767168402671814, + "step": 6492 + }, + { + "epoch": 2.740084388185654, + "grad_norm": 1.4078115224838257, + "learning_rate": 6.404891863414736e-05, + "loss": 0.6131237745285034, + "step": 6494 + }, + { + "epoch": 2.740928270042194, + "grad_norm": 1.21550452709198, + "learning_rate": 6.40258698447229e-05, + "loss": 0.5236409306526184, + "step": 6496 + }, + { + "epoch": 2.741772151898734, + "grad_norm": 1.22257661819458, + "learning_rate": 6.400281781979962e-05, + "loss": 0.5483267307281494, + "step": 6498 + }, + { + "epoch": 2.742616033755274, + "grad_norm": 1.1525336503982544, + "learning_rate": 6.39797625646952e-05, + "loss": 0.6161116361618042, + "step": 6500 + }, + { + "epoch": 2.742616033755274, + "eval_loss": 0.6768895387649536, + "eval_runtime": 513.0657, + "eval_samples_per_second": 4.107, + "eval_steps_per_second": 4.107, + "step": 6500 + }, + { + "epoch": 2.7434599156118145, + "grad_norm": 1.094993233680725, + "learning_rate": 6.395670408472804e-05, + "loss": 0.5587809681892395, + "step": 6502 + }, + { + "epoch": 2.7443037974683544, + "grad_norm": 1.1560120582580566, + "learning_rate": 6.393364238521731e-05, + "loss": 0.6118067502975464, + "step": 6504 + }, + { + "epoch": 2.7451476793248943, + "grad_norm": 1.3500670194625854, + "learning_rate": 6.391057747148285e-05, + "loss": 0.6314222812652588, + "step": 6506 + }, + { + "epoch": 2.7459915611814347, + "grad_norm": 1.2182261943817139, + "learning_rate": 6.388750934884535e-05, + "loss": 0.5695898532867432, + "step": 6508 + }, + { + "epoch": 2.7468354430379747, + "grad_norm": 1.3393630981445312, + "learning_rate": 6.386443802262616e-05, + "loss": 0.5848485827445984, + "step": 6510 + }, + { + "epoch": 2.7476793248945146, + "grad_norm": 1.412109375, + "learning_rate": 6.384136349814737e-05, + "loss": 0.5920066237449646, + "step": 6512 + }, + { + "epoch": 2.748523206751055, + "grad_norm": 1.174395203590393, + "learning_rate": 6.381828578073186e-05, + "loss": 0.5770407319068909, + "step": 6514 + }, + { + "epoch": 2.749367088607595, + "grad_norm": 1.2811627388000488, + "learning_rate": 6.37952048757032e-05, + "loss": 0.5780549049377441, + "step": 6516 + }, + { + "epoch": 2.750210970464135, + "grad_norm": 1.0966699123382568, + "learning_rate": 6.377212078838573e-05, + "loss": 0.5276137590408325, + "step": 6518 + }, + { + "epoch": 2.7510548523206753, + "grad_norm": 1.082350730895996, + "learning_rate": 6.374903352410449e-05, + "loss": 0.5744844675064087, + "step": 6520 + }, + { + "epoch": 2.751898734177215, + "grad_norm": 1.342262864112854, + "learning_rate": 6.372594308818527e-05, + "loss": 0.6084962487220764, + "step": 6522 + }, + { + "epoch": 2.752742616033755, + "grad_norm": 1.1922634840011597, + "learning_rate": 6.370284948595458e-05, + "loss": 0.5551698803901672, + "step": 6524 + }, + { + "epoch": 2.7535864978902955, + "grad_norm": 1.1368752717971802, + "learning_rate": 6.36797527227397e-05, + "loss": 0.6398477554321289, + "step": 6526 + }, + { + "epoch": 2.7544303797468355, + "grad_norm": 1.1748154163360596, + "learning_rate": 6.365665280386857e-05, + "loss": 0.6201474666595459, + "step": 6528 + }, + { + "epoch": 2.7552742616033754, + "grad_norm": 1.2439727783203125, + "learning_rate": 6.363354973466993e-05, + "loss": 0.6196629405021667, + "step": 6530 + }, + { + "epoch": 2.756118143459916, + "grad_norm": 1.146153211593628, + "learning_rate": 6.36104435204732e-05, + "loss": 0.6379110813140869, + "step": 6532 + }, + { + "epoch": 2.7569620253164557, + "grad_norm": 1.118996024131775, + "learning_rate": 6.358733416660854e-05, + "loss": 0.5695750713348389, + "step": 6534 + }, + { + "epoch": 2.7578059071729957, + "grad_norm": 1.219043493270874, + "learning_rate": 6.356422167840685e-05, + "loss": 0.5846145153045654, + "step": 6536 + }, + { + "epoch": 2.758649789029536, + "grad_norm": 1.120754361152649, + "learning_rate": 6.354110606119973e-05, + "loss": 0.5762830972671509, + "step": 6538 + }, + { + "epoch": 2.759493670886076, + "grad_norm": 1.0562269687652588, + "learning_rate": 6.351798732031949e-05, + "loss": 0.605473518371582, + "step": 6540 + }, + { + "epoch": 2.760337552742616, + "grad_norm": 1.3034429550170898, + "learning_rate": 6.34948654610992e-05, + "loss": 0.6314473748207092, + "step": 6542 + }, + { + "epoch": 2.7611814345991563, + "grad_norm": 1.1129206418991089, + "learning_rate": 6.347174048887263e-05, + "loss": 0.5332847237586975, + "step": 6544 + }, + { + "epoch": 2.7620253164556963, + "grad_norm": 1.068705439567566, + "learning_rate": 6.344861240897423e-05, + "loss": 0.6015381813049316, + "step": 6546 + }, + { + "epoch": 2.762869198312236, + "grad_norm": 1.161868691444397, + "learning_rate": 6.342548122673925e-05, + "loss": 0.5989309549331665, + "step": 6548 + }, + { + "epoch": 2.7637130801687766, + "grad_norm": 1.1323082447052002, + "learning_rate": 6.340234694750359e-05, + "loss": 0.5843837261199951, + "step": 6550 + }, + { + "epoch": 2.7645569620253165, + "grad_norm": 1.2302695512771606, + "learning_rate": 6.337920957660388e-05, + "loss": 0.603590726852417, + "step": 6552 + }, + { + "epoch": 2.7654008438818565, + "grad_norm": 1.2483820915222168, + "learning_rate": 6.335606911937749e-05, + "loss": 0.6207526326179504, + "step": 6554 + }, + { + "epoch": 2.766244725738397, + "grad_norm": 1.353147029876709, + "learning_rate": 6.333292558116245e-05, + "loss": 0.5964639782905579, + "step": 6556 + }, + { + "epoch": 2.767088607594937, + "grad_norm": 1.2074922323226929, + "learning_rate": 6.330977896729755e-05, + "loss": 0.5078298449516296, + "step": 6558 + }, + { + "epoch": 2.7679324894514767, + "grad_norm": 1.208228588104248, + "learning_rate": 6.328662928312225e-05, + "loss": 0.5649725198745728, + "step": 6560 + }, + { + "epoch": 2.768776371308017, + "grad_norm": 1.2749123573303223, + "learning_rate": 6.326347653397676e-05, + "loss": 0.5552892684936523, + "step": 6562 + }, + { + "epoch": 2.769620253164557, + "grad_norm": 1.1484880447387695, + "learning_rate": 6.324032072520197e-05, + "loss": 0.6514022350311279, + "step": 6564 + }, + { + "epoch": 2.770464135021097, + "grad_norm": 1.1836612224578857, + "learning_rate": 6.321716186213946e-05, + "loss": 0.5342835783958435, + "step": 6566 + }, + { + "epoch": 2.7713080168776374, + "grad_norm": 1.1626124382019043, + "learning_rate": 6.319399995013154e-05, + "loss": 0.6427282691001892, + "step": 6568 + }, + { + "epoch": 2.7721518987341773, + "grad_norm": 1.0736790895462036, + "learning_rate": 6.317083499452123e-05, + "loss": 0.5326613187789917, + "step": 6570 + }, + { + "epoch": 2.7729957805907173, + "grad_norm": 1.1652518510818481, + "learning_rate": 6.314766700065227e-05, + "loss": 0.543228268623352, + "step": 6572 + }, + { + "epoch": 2.7738396624472577, + "grad_norm": 1.232256531715393, + "learning_rate": 6.3124495973869e-05, + "loss": 0.5558459758758545, + "step": 6574 + }, + { + "epoch": 2.7746835443037976, + "grad_norm": 1.3306560516357422, + "learning_rate": 6.310132191951659e-05, + "loss": 0.6432561874389648, + "step": 6576 + }, + { + "epoch": 2.7755274261603375, + "grad_norm": 1.3863320350646973, + "learning_rate": 6.307814484294083e-05, + "loss": 0.6424768567085266, + "step": 6578 + }, + { + "epoch": 2.7763713080168775, + "grad_norm": 1.186691164970398, + "learning_rate": 6.305496474948822e-05, + "loss": 0.5481483936309814, + "step": 6580 + }, + { + "epoch": 2.777215189873418, + "grad_norm": 1.2820651531219482, + "learning_rate": 6.303178164450596e-05, + "loss": 0.5352432727813721, + "step": 6582 + }, + { + "epoch": 2.778059071729958, + "grad_norm": 1.1904656887054443, + "learning_rate": 6.300859553334196e-05, + "loss": 0.6270323991775513, + "step": 6584 + }, + { + "epoch": 2.7789029535864977, + "grad_norm": 1.1635342836380005, + "learning_rate": 6.29854064213448e-05, + "loss": 0.5700342059135437, + "step": 6586 + }, + { + "epoch": 2.779746835443038, + "grad_norm": 1.1065751314163208, + "learning_rate": 6.296221431386379e-05, + "loss": 0.5618587136268616, + "step": 6588 + }, + { + "epoch": 2.780590717299578, + "grad_norm": 1.3106048107147217, + "learning_rate": 6.293901921624885e-05, + "loss": 0.5982993841171265, + "step": 6590 + }, + { + "epoch": 2.781434599156118, + "grad_norm": 1.210839867591858, + "learning_rate": 6.291582113385071e-05, + "loss": 0.6210941076278687, + "step": 6592 + }, + { + "epoch": 2.782278481012658, + "grad_norm": 1.1407668590545654, + "learning_rate": 6.289262007202066e-05, + "loss": 0.5711221694946289, + "step": 6594 + }, + { + "epoch": 2.7831223628691983, + "grad_norm": 1.2315012216567993, + "learning_rate": 6.286941603611078e-05, + "loss": 0.5741305947303772, + "step": 6596 + }, + { + "epoch": 2.7839662447257383, + "grad_norm": 1.3056857585906982, + "learning_rate": 6.284620903147377e-05, + "loss": 0.5329633951187134, + "step": 6598 + }, + { + "epoch": 2.7848101265822782, + "grad_norm": 1.1501489877700806, + "learning_rate": 6.282299906346306e-05, + "loss": 0.6097646951675415, + "step": 6600 + }, + { + "epoch": 2.7848101265822782, + "eval_loss": 0.6737648844718933, + "eval_runtime": 512.921, + "eval_samples_per_second": 4.108, + "eval_steps_per_second": 4.108, + "step": 6600 + }, + { + "epoch": 2.7856540084388186, + "grad_norm": 1.0871381759643555, + "learning_rate": 6.279978613743275e-05, + "loss": 0.5561007857322693, + "step": 6602 + }, + { + "epoch": 2.7864978902953585, + "grad_norm": 1.188563585281372, + "learning_rate": 6.277657025873758e-05, + "loss": 0.5803903341293335, + "step": 6604 + }, + { + "epoch": 2.7873417721518985, + "grad_norm": 1.1444810628890991, + "learning_rate": 6.275335143273305e-05, + "loss": 0.5143039226531982, + "step": 6606 + }, + { + "epoch": 2.788185654008439, + "grad_norm": 1.096595287322998, + "learning_rate": 6.273012966477526e-05, + "loss": 0.543094277381897, + "step": 6608 + }, + { + "epoch": 2.789029535864979, + "grad_norm": 1.195801019668579, + "learning_rate": 6.270690496022105e-05, + "loss": 0.5597999095916748, + "step": 6610 + }, + { + "epoch": 2.7898734177215188, + "grad_norm": 1.236894965171814, + "learning_rate": 6.26836773244279e-05, + "loss": 0.5496288537979126, + "step": 6612 + }, + { + "epoch": 2.790717299578059, + "grad_norm": 1.1474205255508423, + "learning_rate": 6.2660446762754e-05, + "loss": 0.6104549169540405, + "step": 6614 + }, + { + "epoch": 2.791561181434599, + "grad_norm": 1.1649401187896729, + "learning_rate": 6.263721328055818e-05, + "loss": 0.6186942458152771, + "step": 6616 + }, + { + "epoch": 2.792405063291139, + "grad_norm": 1.1187876462936401, + "learning_rate": 6.261397688319993e-05, + "loss": 0.5332194566726685, + "step": 6618 + }, + { + "epoch": 2.7932489451476794, + "grad_norm": 1.2765967845916748, + "learning_rate": 6.25907375760395e-05, + "loss": 0.6478220224380493, + "step": 6620 + }, + { + "epoch": 2.7940928270042193, + "grad_norm": 1.232173204421997, + "learning_rate": 6.256749536443771e-05, + "loss": 0.6406530141830444, + "step": 6622 + }, + { + "epoch": 2.7949367088607593, + "grad_norm": 1.045032262802124, + "learning_rate": 6.254425025375612e-05, + "loss": 0.6082814931869507, + "step": 6624 + }, + { + "epoch": 2.7957805907172997, + "grad_norm": 1.2285528182983398, + "learning_rate": 6.252100224935689e-05, + "loss": 0.6527243852615356, + "step": 6626 + }, + { + "epoch": 2.7966244725738396, + "grad_norm": 1.1741310358047485, + "learning_rate": 6.24977513566029e-05, + "loss": 0.5787529945373535, + "step": 6628 + }, + { + "epoch": 2.7974683544303796, + "grad_norm": 1.1933153867721558, + "learning_rate": 6.247449758085773e-05, + "loss": 0.5816542506217957, + "step": 6630 + }, + { + "epoch": 2.79831223628692, + "grad_norm": 1.3991938829421997, + "learning_rate": 6.245124092748552e-05, + "loss": 0.61644446849823, + "step": 6632 + }, + { + "epoch": 2.79915611814346, + "grad_norm": 1.1720032691955566, + "learning_rate": 6.242798140185117e-05, + "loss": 0.5762863755226135, + "step": 6634 + }, + { + "epoch": 2.8, + "grad_norm": 1.2190258502960205, + "learning_rate": 6.240471900932019e-05, + "loss": 0.656046986579895, + "step": 6636 + }, + { + "epoch": 2.80084388185654, + "grad_norm": 1.128190040588379, + "learning_rate": 6.238145375525877e-05, + "loss": 0.5192724466323853, + "step": 6638 + }, + { + "epoch": 2.80168776371308, + "grad_norm": 1.2625527381896973, + "learning_rate": 6.235818564503377e-05, + "loss": 0.6037933826446533, + "step": 6640 + }, + { + "epoch": 2.80253164556962, + "grad_norm": 1.2483288049697876, + "learning_rate": 6.233491468401268e-05, + "loss": 0.6108730435371399, + "step": 6642 + }, + { + "epoch": 2.8033755274261605, + "grad_norm": 1.3986961841583252, + "learning_rate": 6.231164087756367e-05, + "loss": 0.6408922672271729, + "step": 6644 + }, + { + "epoch": 2.8042194092827004, + "grad_norm": 1.2224489450454712, + "learning_rate": 6.228836423105556e-05, + "loss": 0.648504376411438, + "step": 6646 + }, + { + "epoch": 2.8050632911392404, + "grad_norm": 1.2060397863388062, + "learning_rate": 6.226508474985782e-05, + "loss": 0.5769880414009094, + "step": 6648 + }, + { + "epoch": 2.8059071729957807, + "grad_norm": 1.262581467628479, + "learning_rate": 6.224180243934058e-05, + "loss": 0.6585965752601624, + "step": 6650 + }, + { + "epoch": 2.8067510548523207, + "grad_norm": 1.1175196170806885, + "learning_rate": 6.221851730487463e-05, + "loss": 0.618746817111969, + "step": 6652 + }, + { + "epoch": 2.8075949367088606, + "grad_norm": 1.2256932258605957, + "learning_rate": 6.219522935183141e-05, + "loss": 0.5708954930305481, + "step": 6654 + }, + { + "epoch": 2.808438818565401, + "grad_norm": 1.3388983011245728, + "learning_rate": 6.217193858558298e-05, + "loss": 0.608521580696106, + "step": 6656 + }, + { + "epoch": 2.809282700421941, + "grad_norm": 1.2913719415664673, + "learning_rate": 6.214864501150208e-05, + "loss": 0.64382004737854, + "step": 6658 + }, + { + "epoch": 2.810126582278481, + "grad_norm": 1.039406657218933, + "learning_rate": 6.21253486349621e-05, + "loss": 0.567484438419342, + "step": 6660 + }, + { + "epoch": 2.8109704641350213, + "grad_norm": 1.123612642288208, + "learning_rate": 6.210204946133707e-05, + "loss": 0.5696196556091309, + "step": 6662 + }, + { + "epoch": 2.811814345991561, + "grad_norm": 1.1850367784500122, + "learning_rate": 6.207874749600164e-05, + "loss": 0.6068252921104431, + "step": 6664 + }, + { + "epoch": 2.812658227848101, + "grad_norm": 1.3630138635635376, + "learning_rate": 6.205544274433115e-05, + "loss": 0.6329811215400696, + "step": 6666 + }, + { + "epoch": 2.8135021097046415, + "grad_norm": 1.217410683631897, + "learning_rate": 6.203213521170154e-05, + "loss": 0.5600330829620361, + "step": 6668 + }, + { + "epoch": 2.8143459915611815, + "grad_norm": 3.5133564472198486, + "learning_rate": 6.200882490348942e-05, + "loss": 0.639461874961853, + "step": 6670 + }, + { + "epoch": 2.8151898734177214, + "grad_norm": 1.2535229921340942, + "learning_rate": 6.198551182507203e-05, + "loss": 0.5908592939376831, + "step": 6672 + }, + { + "epoch": 2.816033755274262, + "grad_norm": 1.2667300701141357, + "learning_rate": 6.196219598182726e-05, + "loss": 0.5490466952323914, + "step": 6674 + }, + { + "epoch": 2.8168776371308017, + "grad_norm": 1.332416296005249, + "learning_rate": 6.19388773791336e-05, + "loss": 0.6570454239845276, + "step": 6676 + }, + { + "epoch": 2.8177215189873417, + "grad_norm": 1.2882871627807617, + "learning_rate": 6.191555602237023e-05, + "loss": 0.6296758651733398, + "step": 6678 + }, + { + "epoch": 2.818565400843882, + "grad_norm": 1.2949540615081787, + "learning_rate": 6.189223191691691e-05, + "loss": 0.6238688826560974, + "step": 6680 + }, + { + "epoch": 2.819409282700422, + "grad_norm": 1.3507297039031982, + "learning_rate": 6.18689050681541e-05, + "loss": 0.6287838220596313, + "step": 6682 + }, + { + "epoch": 2.820253164556962, + "grad_norm": 1.0284801721572876, + "learning_rate": 6.184557548146282e-05, + "loss": 0.5871602892875671, + "step": 6684 + }, + { + "epoch": 2.8210970464135023, + "grad_norm": 1.3238089084625244, + "learning_rate": 6.182224316222478e-05, + "loss": 0.5973687171936035, + "step": 6686 + }, + { + "epoch": 2.8219409282700423, + "grad_norm": 1.0406007766723633, + "learning_rate": 6.179890811582232e-05, + "loss": 0.5463243722915649, + "step": 6688 + }, + { + "epoch": 2.8227848101265822, + "grad_norm": 1.1670905351638794, + "learning_rate": 6.177557034763832e-05, + "loss": 0.5976935625076294, + "step": 6690 + }, + { + "epoch": 2.8236286919831226, + "grad_norm": 1.0810848474502563, + "learning_rate": 6.175222986305642e-05, + "loss": 0.6159120798110962, + "step": 6692 + }, + { + "epoch": 2.8244725738396625, + "grad_norm": 1.1419588327407837, + "learning_rate": 6.172888666746078e-05, + "loss": 0.6232127547264099, + "step": 6694 + }, + { + "epoch": 2.8253164556962025, + "grad_norm": 1.118447184562683, + "learning_rate": 6.170554076623627e-05, + "loss": 0.579402506351471, + "step": 6696 + }, + { + "epoch": 2.826160337552743, + "grad_norm": 1.3584961891174316, + "learning_rate": 6.168219216476828e-05, + "loss": 0.5871124863624573, + "step": 6698 + }, + { + "epoch": 2.827004219409283, + "grad_norm": 1.1773170232772827, + "learning_rate": 6.165884086844295e-05, + "loss": 0.6119418144226074, + "step": 6700 + }, + { + "epoch": 2.827004219409283, + "eval_loss": 0.6737436056137085, + "eval_runtime": 513.2559, + "eval_samples_per_second": 4.105, + "eval_steps_per_second": 4.105, + "step": 6700 + }, + { + "epoch": 2.8278481012658228, + "grad_norm": 1.2150315046310425, + "learning_rate": 6.163548688264693e-05, + "loss": 0.606975257396698, + "step": 6702 + }, + { + "epoch": 2.828691983122363, + "grad_norm": 1.23250412940979, + "learning_rate": 6.161213021276754e-05, + "loss": 0.5860852003097534, + "step": 6704 + }, + { + "epoch": 2.829535864978903, + "grad_norm": 1.1053578853607178, + "learning_rate": 6.158877086419273e-05, + "loss": 0.543590784072876, + "step": 6706 + }, + { + "epoch": 2.830379746835443, + "grad_norm": 1.2813301086425781, + "learning_rate": 6.156540884231105e-05, + "loss": 0.6040283441543579, + "step": 6708 + }, + { + "epoch": 2.831223628691983, + "grad_norm": 1.2987254858016968, + "learning_rate": 6.154204415251169e-05, + "loss": 0.586407482624054, + "step": 6710 + }, + { + "epoch": 2.8320675105485233, + "grad_norm": 1.1980805397033691, + "learning_rate": 6.151867680018438e-05, + "loss": 0.6180199384689331, + "step": 6712 + }, + { + "epoch": 2.8329113924050633, + "grad_norm": 1.642957329750061, + "learning_rate": 6.149530679071956e-05, + "loss": 0.5772807002067566, + "step": 6714 + }, + { + "epoch": 2.8337552742616032, + "grad_norm": 1.3908783197402954, + "learning_rate": 6.147193412950825e-05, + "loss": 0.6107099652290344, + "step": 6716 + }, + { + "epoch": 2.8345991561181436, + "grad_norm": 1.3866089582443237, + "learning_rate": 6.144855882194206e-05, + "loss": 0.5335796475410461, + "step": 6718 + }, + { + "epoch": 2.8354430379746836, + "grad_norm": 1.2989959716796875, + "learning_rate": 6.14251808734132e-05, + "loss": 0.5962506532669067, + "step": 6720 + }, + { + "epoch": 2.8362869198312235, + "grad_norm": 1.3145360946655273, + "learning_rate": 6.140180028931456e-05, + "loss": 0.6368465423583984, + "step": 6722 + }, + { + "epoch": 2.8371308016877634, + "grad_norm": 1.1515997648239136, + "learning_rate": 6.137841707503955e-05, + "loss": 0.6448454856872559, + "step": 6724 + }, + { + "epoch": 2.837974683544304, + "grad_norm": 1.0785750150680542, + "learning_rate": 6.135503123598225e-05, + "loss": 0.49946340918540955, + "step": 6726 + }, + { + "epoch": 2.8388185654008438, + "grad_norm": 1.1683695316314697, + "learning_rate": 6.133164277753733e-05, + "loss": 0.550529956817627, + "step": 6728 + }, + { + "epoch": 2.8396624472573837, + "grad_norm": 1.0640658140182495, + "learning_rate": 6.130825170510006e-05, + "loss": 0.5135641098022461, + "step": 6730 + }, + { + "epoch": 2.840506329113924, + "grad_norm": 1.1805553436279297, + "learning_rate": 6.12848580240663e-05, + "loss": 0.6608622670173645, + "step": 6732 + }, + { + "epoch": 2.841350210970464, + "grad_norm": 1.2218462228775024, + "learning_rate": 6.12614617398325e-05, + "loss": 0.6797777414321899, + "step": 6734 + }, + { + "epoch": 2.842194092827004, + "grad_norm": 1.0677950382232666, + "learning_rate": 6.123806285779576e-05, + "loss": 0.5570073127746582, + "step": 6736 + }, + { + "epoch": 2.8430379746835444, + "grad_norm": 1.202785849571228, + "learning_rate": 6.121466138335376e-05, + "loss": 0.6273435354232788, + "step": 6738 + }, + { + "epoch": 2.8438818565400843, + "grad_norm": 1.1837576627731323, + "learning_rate": 6.119125732190477e-05, + "loss": 0.6337732076644897, + "step": 6740 + }, + { + "epoch": 2.8447257383966242, + "grad_norm": 1.2692649364471436, + "learning_rate": 6.116785067884764e-05, + "loss": 0.6228005886077881, + "step": 6742 + }, + { + "epoch": 2.8455696202531646, + "grad_norm": 1.3237874507904053, + "learning_rate": 6.114444145958183e-05, + "loss": 0.5781991481781006, + "step": 6744 + }, + { + "epoch": 2.8464135021097046, + "grad_norm": 1.2384692430496216, + "learning_rate": 6.112102966950742e-05, + "loss": 0.5583632588386536, + "step": 6746 + }, + { + "epoch": 2.8472573839662445, + "grad_norm": 1.1730914115905762, + "learning_rate": 6.109761531402505e-05, + "loss": 0.5704524517059326, + "step": 6748 + }, + { + "epoch": 2.848101265822785, + "grad_norm": 1.3047250509262085, + "learning_rate": 6.107419839853597e-05, + "loss": 0.5658026933670044, + "step": 6750 + }, + { + "epoch": 2.848945147679325, + "grad_norm": 1.2044686079025269, + "learning_rate": 6.105077892844198e-05, + "loss": 0.5919271111488342, + "step": 6752 + }, + { + "epoch": 2.8497890295358648, + "grad_norm": 1.1952540874481201, + "learning_rate": 6.102735690914554e-05, + "loss": 0.578326404094696, + "step": 6754 + }, + { + "epoch": 2.850632911392405, + "grad_norm": 1.2275413274765015, + "learning_rate": 6.1003932346049633e-05, + "loss": 0.6079645156860352, + "step": 6756 + }, + { + "epoch": 2.851476793248945, + "grad_norm": 1.2760299444198608, + "learning_rate": 6.0980505244557884e-05, + "loss": 0.6111302375793457, + "step": 6758 + }, + { + "epoch": 2.852320675105485, + "grad_norm": 1.4044286012649536, + "learning_rate": 6.095707561007444e-05, + "loss": 0.6397197246551514, + "step": 6760 + }, + { + "epoch": 2.8531645569620254, + "grad_norm": 1.3707174062728882, + "learning_rate": 6.0933643448004094e-05, + "loss": 0.6183030605316162, + "step": 6762 + }, + { + "epoch": 2.8540084388185654, + "grad_norm": 1.290480613708496, + "learning_rate": 6.091020876375221e-05, + "loss": 0.6367093920707703, + "step": 6764 + }, + { + "epoch": 2.8548523206751053, + "grad_norm": 1.0469609498977661, + "learning_rate": 6.0886771562724673e-05, + "loss": 0.550685703754425, + "step": 6766 + }, + { + "epoch": 2.8556962025316457, + "grad_norm": 1.312018871307373, + "learning_rate": 6.086333185032804e-05, + "loss": 0.5789266228675842, + "step": 6768 + }, + { + "epoch": 2.8565400843881856, + "grad_norm": 1.3253673315048218, + "learning_rate": 6.0839889631969374e-05, + "loss": 0.5595589876174927, + "step": 6770 + }, + { + "epoch": 2.8573839662447256, + "grad_norm": 1.2848154306411743, + "learning_rate": 6.0816444913056356e-05, + "loss": 0.5642995238304138, + "step": 6772 + }, + { + "epoch": 2.858227848101266, + "grad_norm": 1.2492237091064453, + "learning_rate": 6.079299769899722e-05, + "loss": 0.5502132773399353, + "step": 6774 + }, + { + "epoch": 2.859071729957806, + "grad_norm": 1.2817713022232056, + "learning_rate": 6.076954799520081e-05, + "loss": 0.5535969138145447, + "step": 6776 + }, + { + "epoch": 2.859915611814346, + "grad_norm": 1.1986786127090454, + "learning_rate": 6.074609580707651e-05, + "loss": 0.6086817979812622, + "step": 6778 + }, + { + "epoch": 2.8607594936708862, + "grad_norm": 1.274839162826538, + "learning_rate": 6.0722641140034285e-05, + "loss": 0.6254655718803406, + "step": 6780 + }, + { + "epoch": 2.861603375527426, + "grad_norm": 1.0627212524414062, + "learning_rate": 6.0699183999484685e-05, + "loss": 0.6227576732635498, + "step": 6782 + }, + { + "epoch": 2.862447257383966, + "grad_norm": 1.2313296794891357, + "learning_rate": 6.0675724390838815e-05, + "loss": 0.6257740259170532, + "step": 6784 + }, + { + "epoch": 2.8632911392405065, + "grad_norm": 1.1398836374282837, + "learning_rate": 6.065226231950837e-05, + "loss": 0.6438660621643066, + "step": 6786 + }, + { + "epoch": 2.8641350210970464, + "grad_norm": 1.1606178283691406, + "learning_rate": 6.0628797790905566e-05, + "loss": 0.5654972195625305, + "step": 6788 + }, + { + "epoch": 2.8649789029535864, + "grad_norm": 1.2857846021652222, + "learning_rate": 6.060533081044326e-05, + "loss": 0.5413897633552551, + "step": 6790 + }, + { + "epoch": 2.8658227848101268, + "grad_norm": 1.2358965873718262, + "learning_rate": 6.058186138353481e-05, + "loss": 0.5737078785896301, + "step": 6792 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 1.0813729763031006, + "learning_rate": 6.055838951559417e-05, + "loss": 0.5880253314971924, + "step": 6794 + }, + { + "epoch": 2.8675105485232066, + "grad_norm": 1.2310819625854492, + "learning_rate": 6.0534915212035836e-05, + "loss": 0.5762695074081421, + "step": 6796 + }, + { + "epoch": 2.868354430379747, + "grad_norm": 1.2762445211410522, + "learning_rate": 6.0511438478274906e-05, + "loss": 0.6172254085540771, + "step": 6798 + }, + { + "epoch": 2.869198312236287, + "grad_norm": 1.0100860595703125, + "learning_rate": 6.0487959319726994e-05, + "loss": 0.5419955849647522, + "step": 6800 + }, + { + "epoch": 2.869198312236287, + "eval_loss": 0.6721681356430054, + "eval_runtime": 513.1285, + "eval_samples_per_second": 4.106, + "eval_steps_per_second": 4.106, + "step": 6800 + }, + { + "epoch": 2.870042194092827, + "grad_norm": 1.3078527450561523, + "learning_rate": 6.046447774180827e-05, + "loss": 0.6330351233482361, + "step": 6802 + }, + { + "epoch": 2.8708860759493673, + "grad_norm": 1.3523176908493042, + "learning_rate": 6.044099374993553e-05, + "loss": 0.5479466915130615, + "step": 6804 + }, + { + "epoch": 2.8717299578059072, + "grad_norm": 1.109269142150879, + "learning_rate": 6.041750734952604e-05, + "loss": 0.5516952872276306, + "step": 6806 + }, + { + "epoch": 2.872573839662447, + "grad_norm": 1.2368918657302856, + "learning_rate": 6.039401854599769e-05, + "loss": 0.5878147482872009, + "step": 6808 + }, + { + "epoch": 2.8734177215189876, + "grad_norm": 1.1626032590866089, + "learning_rate": 6.037052734476886e-05, + "loss": 0.5637685656547546, + "step": 6810 + }, + { + "epoch": 2.8742616033755275, + "grad_norm": 1.1955288648605347, + "learning_rate": 6.0347033751258566e-05, + "loss": 0.5398213267326355, + "step": 6812 + }, + { + "epoch": 2.8751054852320674, + "grad_norm": 1.3805105686187744, + "learning_rate": 6.0323537770886285e-05, + "loss": 0.6098157167434692, + "step": 6814 + }, + { + "epoch": 2.875949367088608, + "grad_norm": 1.2644819021224976, + "learning_rate": 6.030003940907212e-05, + "loss": 0.5970560312271118, + "step": 6816 + }, + { + "epoch": 2.8767932489451478, + "grad_norm": 1.1625932455062866, + "learning_rate": 6.027653867123667e-05, + "loss": 0.5918156504631042, + "step": 6818 + }, + { + "epoch": 2.8776371308016877, + "grad_norm": 1.3591371774673462, + "learning_rate": 6.025303556280112e-05, + "loss": 0.5625584721565247, + "step": 6820 + }, + { + "epoch": 2.878481012658228, + "grad_norm": 1.266757845878601, + "learning_rate": 6.022953008918718e-05, + "loss": 0.6422242522239685, + "step": 6822 + }, + { + "epoch": 2.879324894514768, + "grad_norm": 1.273234248161316, + "learning_rate": 6.0206022255817095e-05, + "loss": 0.6625136733055115, + "step": 6824 + }, + { + "epoch": 2.880168776371308, + "grad_norm": 1.2808254957199097, + "learning_rate": 6.0182512068113715e-05, + "loss": 0.6410037279129028, + "step": 6826 + }, + { + "epoch": 2.8810126582278484, + "grad_norm": 1.1684991121292114, + "learning_rate": 6.0158999531500335e-05, + "loss": 0.5269461274147034, + "step": 6828 + }, + { + "epoch": 2.8818565400843883, + "grad_norm": 1.3655736446380615, + "learning_rate": 6.0135484651400886e-05, + "loss": 0.6546348929405212, + "step": 6830 + }, + { + "epoch": 2.8827004219409282, + "grad_norm": 1.3913087844848633, + "learning_rate": 6.011196743323977e-05, + "loss": 0.5872722864151001, + "step": 6832 + }, + { + "epoch": 2.8835443037974686, + "grad_norm": 1.1047117710113525, + "learning_rate": 6.008844788244199e-05, + "loss": 0.5498786568641663, + "step": 6834 + }, + { + "epoch": 2.8843881856540086, + "grad_norm": 1.0897705554962158, + "learning_rate": 6.006492600443301e-05, + "loss": 0.5740244388580322, + "step": 6836 + }, + { + "epoch": 2.8852320675105485, + "grad_norm": 1.0046823024749756, + "learning_rate": 6.004140180463891e-05, + "loss": 0.5618779063224792, + "step": 6838 + }, + { + "epoch": 2.8860759493670884, + "grad_norm": 1.231499195098877, + "learning_rate": 6.001787528848628e-05, + "loss": 0.6124269366264343, + "step": 6840 + }, + { + "epoch": 2.886919831223629, + "grad_norm": 1.1776596307754517, + "learning_rate": 5.999434646140219e-05, + "loss": 0.5512109994888306, + "step": 6842 + }, + { + "epoch": 2.8877637130801688, + "grad_norm": 1.2528871297836304, + "learning_rate": 5.9970815328814334e-05, + "loss": 0.610329270362854, + "step": 6844 + }, + { + "epoch": 2.8886075949367087, + "grad_norm": 1.4408416748046875, + "learning_rate": 5.994728189615087e-05, + "loss": 0.568793773651123, + "step": 6846 + }, + { + "epoch": 2.889451476793249, + "grad_norm": 1.2031673192977905, + "learning_rate": 5.9923746168840523e-05, + "loss": 0.6107773184776306, + "step": 6848 + }, + { + "epoch": 2.890295358649789, + "grad_norm": 1.3201221227645874, + "learning_rate": 5.990020815231251e-05, + "loss": 0.6217910647392273, + "step": 6850 + }, + { + "epoch": 2.891139240506329, + "grad_norm": 1.1753840446472168, + "learning_rate": 5.987666785199661e-05, + "loss": 0.6051784157752991, + "step": 6852 + }, + { + "epoch": 2.8919831223628694, + "grad_norm": 1.2406786680221558, + "learning_rate": 5.985312527332314e-05, + "loss": 0.5736448168754578, + "step": 6854 + }, + { + "epoch": 2.8928270042194093, + "grad_norm": 1.6206021308898926, + "learning_rate": 5.98295804217229e-05, + "loss": 0.5454224944114685, + "step": 6856 + }, + { + "epoch": 2.8936708860759492, + "grad_norm": 1.2756178379058838, + "learning_rate": 5.9806033302627227e-05, + "loss": 0.5912685990333557, + "step": 6858 + }, + { + "epoch": 2.894514767932489, + "grad_norm": 1.223631501197815, + "learning_rate": 5.9782483921468e-05, + "loss": 0.5619014501571655, + "step": 6860 + }, + { + "epoch": 2.8953586497890296, + "grad_norm": 1.06546151638031, + "learning_rate": 5.975893228367762e-05, + "loss": 0.5629459619522095, + "step": 6862 + }, + { + "epoch": 2.8962025316455695, + "grad_norm": 1.0573277473449707, + "learning_rate": 5.9735378394688965e-05, + "loss": 0.4997110366821289, + "step": 6864 + }, + { + "epoch": 2.8970464135021095, + "grad_norm": 1.2832465171813965, + "learning_rate": 5.97118222599355e-05, + "loss": 0.6370334625244141, + "step": 6866 + }, + { + "epoch": 2.89789029535865, + "grad_norm": 1.1721924543380737, + "learning_rate": 5.968826388485116e-05, + "loss": 0.6095840334892273, + "step": 6868 + }, + { + "epoch": 2.8987341772151898, + "grad_norm": 1.1428951025009155, + "learning_rate": 5.966470327487042e-05, + "loss": 0.6075419187545776, + "step": 6870 + }, + { + "epoch": 2.8995780590717297, + "grad_norm": 1.2369399070739746, + "learning_rate": 5.964114043542822e-05, + "loss": 0.6376850605010986, + "step": 6872 + }, + { + "epoch": 2.90042194092827, + "grad_norm": 1.178520679473877, + "learning_rate": 5.961757537196011e-05, + "loss": 0.57747882604599, + "step": 6874 + }, + { + "epoch": 2.90126582278481, + "grad_norm": 1.2600151300430298, + "learning_rate": 5.959400808990205e-05, + "loss": 0.626102864742279, + "step": 6876 + }, + { + "epoch": 2.90210970464135, + "grad_norm": 1.2809659242630005, + "learning_rate": 5.957043859469058e-05, + "loss": 0.6087106466293335, + "step": 6878 + }, + { + "epoch": 2.9029535864978904, + "grad_norm": 1.2029764652252197, + "learning_rate": 5.954686689176274e-05, + "loss": 0.599288284778595, + "step": 6880 + }, + { + "epoch": 2.9037974683544303, + "grad_norm": 1.2000751495361328, + "learning_rate": 5.952329298655607e-05, + "loss": 0.6364397406578064, + "step": 6882 + }, + { + "epoch": 2.9046413502109703, + "grad_norm": 1.3380756378173828, + "learning_rate": 5.949971688450859e-05, + "loss": 0.6032583713531494, + "step": 6884 + }, + { + "epoch": 2.9054852320675106, + "grad_norm": 1.207139015197754, + "learning_rate": 5.9476138591058874e-05, + "loss": 0.6217718720436096, + "step": 6886 + }, + { + "epoch": 2.9063291139240506, + "grad_norm": 1.2060731649398804, + "learning_rate": 5.945255811164598e-05, + "loss": 0.5663400888442993, + "step": 6888 + }, + { + "epoch": 2.9071729957805905, + "grad_norm": 1.3331942558288574, + "learning_rate": 5.9428975451709465e-05, + "loss": 0.583290696144104, + "step": 6890 + }, + { + "epoch": 2.908016877637131, + "grad_norm": 1.226565957069397, + "learning_rate": 5.94053906166894e-05, + "loss": 0.5606404542922974, + "step": 6892 + }, + { + "epoch": 2.908860759493671, + "grad_norm": 1.167909026145935, + "learning_rate": 5.938180361202636e-05, + "loss": 0.5337109565734863, + "step": 6894 + }, + { + "epoch": 2.909704641350211, + "grad_norm": 1.2748368978500366, + "learning_rate": 5.93582144431614e-05, + "loss": 0.64582759141922, + "step": 6896 + }, + { + "epoch": 2.910548523206751, + "grad_norm": 1.2209413051605225, + "learning_rate": 5.93346231155361e-05, + "loss": 0.631919801235199, + "step": 6898 + }, + { + "epoch": 2.911392405063291, + "grad_norm": 1.2692270278930664, + "learning_rate": 5.931102963459252e-05, + "loss": 0.5999054908752441, + "step": 6900 + }, + { + "epoch": 2.911392405063291, + "eval_loss": 0.6713213920593262, + "eval_runtime": 513.1265, + "eval_samples_per_second": 4.106, + "eval_steps_per_second": 4.106, + "step": 6900 + }, + { + "epoch": 2.912236286919831, + "grad_norm": 1.3654414415359497, + "learning_rate": 5.928743400577323e-05, + "loss": 0.634549081325531, + "step": 6902 + }, + { + "epoch": 2.9130801687763714, + "grad_norm": 1.4427542686462402, + "learning_rate": 5.926383623452128e-05, + "loss": 0.684973418712616, + "step": 6904 + }, + { + "epoch": 2.9139240506329114, + "grad_norm": 1.3192591667175293, + "learning_rate": 5.9240236326280216e-05, + "loss": 0.6641559600830078, + "step": 6906 + }, + { + "epoch": 2.9147679324894513, + "grad_norm": 1.3328732252120972, + "learning_rate": 5.921663428649411e-05, + "loss": 0.6443751454353333, + "step": 6908 + }, + { + "epoch": 2.9156118143459917, + "grad_norm": 1.191504716873169, + "learning_rate": 5.9193030120607486e-05, + "loss": 0.674626886844635, + "step": 6910 + }, + { + "epoch": 2.9164556962025316, + "grad_norm": 1.2599490880966187, + "learning_rate": 5.916942383406535e-05, + "loss": 0.6297666430473328, + "step": 6912 + }, + { + "epoch": 2.9172995780590716, + "grad_norm": 0.9829303622245789, + "learning_rate": 5.914581543231324e-05, + "loss": 0.5809952616691589, + "step": 6914 + }, + { + "epoch": 2.918143459915612, + "grad_norm": 1.1566280126571655, + "learning_rate": 5.9122204920797176e-05, + "loss": 0.6383126974105835, + "step": 6916 + }, + { + "epoch": 2.918987341772152, + "grad_norm": 1.047351360321045, + "learning_rate": 5.9098592304963616e-05, + "loss": 0.5681729316711426, + "step": 6918 + }, + { + "epoch": 2.919831223628692, + "grad_norm": 1.2059552669525146, + "learning_rate": 5.907497759025956e-05, + "loss": 0.5985210537910461, + "step": 6920 + }, + { + "epoch": 2.9206751054852322, + "grad_norm": 1.1845992803573608, + "learning_rate": 5.905136078213247e-05, + "loss": 0.5815024375915527, + "step": 6922 + }, + { + "epoch": 2.921518987341772, + "grad_norm": 1.3542579412460327, + "learning_rate": 5.9027741886030266e-05, + "loss": 0.6437575221061707, + "step": 6924 + }, + { + "epoch": 2.922362869198312, + "grad_norm": 1.1001946926116943, + "learning_rate": 5.900412090740139e-05, + "loss": 0.5773448348045349, + "step": 6926 + }, + { + "epoch": 2.9232067510548525, + "grad_norm": 1.220449447631836, + "learning_rate": 5.898049785169476e-05, + "loss": 0.6076427698135376, + "step": 6928 + }, + { + "epoch": 2.9240506329113924, + "grad_norm": 1.126592993736267, + "learning_rate": 5.895687272435975e-05, + "loss": 0.5418170690536499, + "step": 6930 + }, + { + "epoch": 2.9248945147679324, + "grad_norm": 1.1005871295928955, + "learning_rate": 5.893324553084622e-05, + "loss": 0.6057441234588623, + "step": 6932 + }, + { + "epoch": 2.9257383966244728, + "grad_norm": 1.0291813611984253, + "learning_rate": 5.89096162766045e-05, + "loss": 0.4844438433647156, + "step": 6934 + }, + { + "epoch": 2.9265822784810127, + "grad_norm": 1.0685851573944092, + "learning_rate": 5.888598496708543e-05, + "loss": 0.5230311751365662, + "step": 6936 + }, + { + "epoch": 2.9274261603375527, + "grad_norm": 1.1004319190979004, + "learning_rate": 5.8862351607740285e-05, + "loss": 0.6191393136978149, + "step": 6938 + }, + { + "epoch": 2.928270042194093, + "grad_norm": 1.2164443731307983, + "learning_rate": 5.8838716204020815e-05, + "loss": 0.5574309825897217, + "step": 6940 + }, + { + "epoch": 2.929113924050633, + "grad_norm": 1.104511022567749, + "learning_rate": 5.881507876137928e-05, + "loss": 0.5820326209068298, + "step": 6942 + }, + { + "epoch": 2.929957805907173, + "grad_norm": 1.4402027130126953, + "learning_rate": 5.879143928526838e-05, + "loss": 0.6016243696212769, + "step": 6944 + }, + { + "epoch": 2.9308016877637133, + "grad_norm": 1.2131510972976685, + "learning_rate": 5.8767797781141274e-05, + "loss": 0.574772834777832, + "step": 6946 + }, + { + "epoch": 2.9316455696202532, + "grad_norm": 1.2146058082580566, + "learning_rate": 5.874415425445159e-05, + "loss": 0.6725581884384155, + "step": 6948 + }, + { + "epoch": 2.932489451476793, + "grad_norm": 1.2887672185897827, + "learning_rate": 5.872050871065349e-05, + "loss": 0.5900663733482361, + "step": 6950 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 1.340739369392395, + "learning_rate": 5.869686115520148e-05, + "loss": 0.6624540686607361, + "step": 6952 + }, + { + "epoch": 2.9341772151898735, + "grad_norm": 1.3531051874160767, + "learning_rate": 5.867321159355062e-05, + "loss": 0.5319855809211731, + "step": 6954 + }, + { + "epoch": 2.9350210970464135, + "grad_norm": 1.441260814666748, + "learning_rate": 5.864956003115646e-05, + "loss": 0.6661397218704224, + "step": 6956 + }, + { + "epoch": 2.935864978902954, + "grad_norm": 1.314922571182251, + "learning_rate": 5.862590647347488e-05, + "loss": 0.6062843799591064, + "step": 6958 + }, + { + "epoch": 2.9367088607594938, + "grad_norm": 1.134419560432434, + "learning_rate": 5.860225092596237e-05, + "loss": 0.6123294234275818, + "step": 6960 + }, + { + "epoch": 2.9375527426160337, + "grad_norm": 1.3195313215255737, + "learning_rate": 5.8578593394075746e-05, + "loss": 0.5984833240509033, + "step": 6962 + }, + { + "epoch": 2.938396624472574, + "grad_norm": 1.1626067161560059, + "learning_rate": 5.855493388327242e-05, + "loss": 0.5695837736129761, + "step": 6964 + }, + { + "epoch": 2.939240506329114, + "grad_norm": 1.1392630338668823, + "learning_rate": 5.853127239901012e-05, + "loss": 0.5688632726669312, + "step": 6966 + }, + { + "epoch": 2.940084388185654, + "grad_norm": 1.2131112813949585, + "learning_rate": 5.850760894674713e-05, + "loss": 0.6139572262763977, + "step": 6968 + }, + { + "epoch": 2.9409282700421944, + "grad_norm": 1.1740806102752686, + "learning_rate": 5.8483943531942154e-05, + "loss": 0.6654361486434937, + "step": 6970 + }, + { + "epoch": 2.9417721518987343, + "grad_norm": 1.1364716291427612, + "learning_rate": 5.846027616005433e-05, + "loss": 0.5477408766746521, + "step": 6972 + }, + { + "epoch": 2.9426160337552743, + "grad_norm": 1.212761640548706, + "learning_rate": 5.843660683654328e-05, + "loss": 0.6023505926132202, + "step": 6974 + }, + { + "epoch": 2.943459915611814, + "grad_norm": 1.1042946577072144, + "learning_rate": 5.8412935566869075e-05, + "loss": 0.5926207304000854, + "step": 6976 + }, + { + "epoch": 2.9443037974683546, + "grad_norm": 1.2444789409637451, + "learning_rate": 5.83892623564922e-05, + "loss": 0.5590356588363647, + "step": 6978 + }, + { + "epoch": 2.9451476793248945, + "grad_norm": 1.0782465934753418, + "learning_rate": 5.8365587210873616e-05, + "loss": 0.553716778755188, + "step": 6980 + }, + { + "epoch": 2.9459915611814345, + "grad_norm": 1.1914669275283813, + "learning_rate": 5.834191013547473e-05, + "loss": 0.5937044024467468, + "step": 6982 + }, + { + "epoch": 2.946835443037975, + "grad_norm": 1.1819682121276855, + "learning_rate": 5.83182311357574e-05, + "loss": 0.6439019441604614, + "step": 6984 + }, + { + "epoch": 2.947679324894515, + "grad_norm": 1.1807081699371338, + "learning_rate": 5.829455021718389e-05, + "loss": 0.5403141379356384, + "step": 6986 + }, + { + "epoch": 2.9485232067510547, + "grad_norm": 1.2721227407455444, + "learning_rate": 5.827086738521692e-05, + "loss": 0.5281378626823425, + "step": 6988 + }, + { + "epoch": 2.9493670886075947, + "grad_norm": 1.6942147016525269, + "learning_rate": 5.824718264531972e-05, + "loss": 0.5722067952156067, + "step": 6990 + }, + { + "epoch": 2.950210970464135, + "grad_norm": 1.3415225744247437, + "learning_rate": 5.8223496002955865e-05, + "loss": 0.6228076815605164, + "step": 6992 + }, + { + "epoch": 2.951054852320675, + "grad_norm": 1.235356092453003, + "learning_rate": 5.819980746358941e-05, + "loss": 0.6019303202629089, + "step": 6994 + }, + { + "epoch": 2.951898734177215, + "grad_norm": 1.2500600814819336, + "learning_rate": 5.817611703268486e-05, + "loss": 0.5699147582054138, + "step": 6996 + }, + { + "epoch": 2.9527426160337553, + "grad_norm": 1.1581830978393555, + "learning_rate": 5.8152424715707145e-05, + "loss": 0.6304079294204712, + "step": 6998 + }, + { + "epoch": 2.9535864978902953, + "grad_norm": 1.2924201488494873, + "learning_rate": 5.812873051812161e-05, + "loss": 0.5464767217636108, + "step": 7000 + }, + { + "epoch": 2.9535864978902953, + "eval_loss": 0.6706293225288391, + "eval_runtime": 513.4396, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 7000 + }, + { + "epoch": 2.954430379746835, + "grad_norm": 1.2045931816101074, + "learning_rate": 5.810503444539405e-05, + "loss": 0.5139666795730591, + "step": 7002 + }, + { + "epoch": 2.9552742616033756, + "grad_norm": 1.0592173337936401, + "learning_rate": 5.8081336502990716e-05, + "loss": 0.574500322341919, + "step": 7004 + }, + { + "epoch": 2.9561181434599155, + "grad_norm": 1.003440499305725, + "learning_rate": 5.805763669637825e-05, + "loss": 0.5784007906913757, + "step": 7006 + }, + { + "epoch": 2.9569620253164555, + "grad_norm": 1.2018240690231323, + "learning_rate": 5.8033935031023757e-05, + "loss": 0.5930284261703491, + "step": 7008 + }, + { + "epoch": 2.957805907172996, + "grad_norm": 1.4118605852127075, + "learning_rate": 5.801023151239473e-05, + "loss": 0.641598105430603, + "step": 7010 + }, + { + "epoch": 2.958649789029536, + "grad_norm": 1.167186975479126, + "learning_rate": 5.798652614595914e-05, + "loss": 0.5804623365402222, + "step": 7012 + }, + { + "epoch": 2.9594936708860757, + "grad_norm": 1.1934285163879395, + "learning_rate": 5.796281893718536e-05, + "loss": 0.6128653883934021, + "step": 7014 + }, + { + "epoch": 2.960337552742616, + "grad_norm": 1.1616190671920776, + "learning_rate": 5.7939109891542164e-05, + "loss": 0.5458035469055176, + "step": 7016 + }, + { + "epoch": 2.961181434599156, + "grad_norm": 1.2685189247131348, + "learning_rate": 5.7915399014498814e-05, + "loss": 0.5968486070632935, + "step": 7018 + }, + { + "epoch": 2.962025316455696, + "grad_norm": 1.2075960636138916, + "learning_rate": 5.789168631152491e-05, + "loss": 0.633076548576355, + "step": 7020 + }, + { + "epoch": 2.9628691983122364, + "grad_norm": 1.1098700761795044, + "learning_rate": 5.786797178809055e-05, + "loss": 0.592155933380127, + "step": 7022 + }, + { + "epoch": 2.9637130801687763, + "grad_norm": 1.1458083391189575, + "learning_rate": 5.78442554496662e-05, + "loss": 0.5480605363845825, + "step": 7024 + }, + { + "epoch": 2.9645569620253163, + "grad_norm": 1.0702389478683472, + "learning_rate": 5.7820537301722766e-05, + "loss": 0.5167773365974426, + "step": 7026 + }, + { + "epoch": 2.9654008438818567, + "grad_norm": 1.200501799583435, + "learning_rate": 5.779681734973157e-05, + "loss": 0.5489409565925598, + "step": 7028 + }, + { + "epoch": 2.9662447257383966, + "grad_norm": 1.075738549232483, + "learning_rate": 5.777309559916435e-05, + "loss": 0.6175599098205566, + "step": 7030 + }, + { + "epoch": 2.9670886075949365, + "grad_norm": 1.2832911014556885, + "learning_rate": 5.774937205549328e-05, + "loss": 0.5721893310546875, + "step": 7032 + }, + { + "epoch": 2.967932489451477, + "grad_norm": 1.3263260126113892, + "learning_rate": 5.7725646724190884e-05, + "loss": 0.7007027864456177, + "step": 7034 + }, + { + "epoch": 2.968776371308017, + "grad_norm": 1.254817247390747, + "learning_rate": 5.770191961073017e-05, + "loss": 0.5676232576370239, + "step": 7036 + }, + { + "epoch": 2.969620253164557, + "grad_norm": 1.0725815296173096, + "learning_rate": 5.767819072058453e-05, + "loss": 0.5563743114471436, + "step": 7038 + }, + { + "epoch": 2.970464135021097, + "grad_norm": 1.2760009765625, + "learning_rate": 5.765446005922774e-05, + "loss": 0.6449083089828491, + "step": 7040 + }, + { + "epoch": 2.971308016877637, + "grad_norm": 1.2716739177703857, + "learning_rate": 5.763072763213402e-05, + "loss": 0.6155483722686768, + "step": 7042 + }, + { + "epoch": 2.972151898734177, + "grad_norm": 1.3112155199050903, + "learning_rate": 5.7606993444778004e-05, + "loss": 0.5797539949417114, + "step": 7044 + }, + { + "epoch": 2.9729957805907175, + "grad_norm": 1.069555401802063, + "learning_rate": 5.758325750263468e-05, + "loss": 0.5277710556983948, + "step": 7046 + }, + { + "epoch": 2.9738396624472574, + "grad_norm": 1.2229703664779663, + "learning_rate": 5.755951981117949e-05, + "loss": 0.5641140937805176, + "step": 7048 + }, + { + "epoch": 2.9746835443037973, + "grad_norm": 1.1228448152542114, + "learning_rate": 5.753578037588827e-05, + "loss": 0.5734127163887024, + "step": 7050 + }, + { + "epoch": 2.9755274261603377, + "grad_norm": 1.372084379196167, + "learning_rate": 5.751203920223724e-05, + "loss": 0.5992875695228577, + "step": 7052 + }, + { + "epoch": 2.9763713080168777, + "grad_norm": 1.232243537902832, + "learning_rate": 5.7488296295703036e-05, + "loss": 0.5811893343925476, + "step": 7054 + }, + { + "epoch": 2.9772151898734176, + "grad_norm": 1.1907097101211548, + "learning_rate": 5.746455166176269e-05, + "loss": 0.5502846240997314, + "step": 7056 + }, + { + "epoch": 2.978059071729958, + "grad_norm": 1.1842679977416992, + "learning_rate": 5.7440805305893644e-05, + "loss": 0.5908812284469604, + "step": 7058 + }, + { + "epoch": 2.978902953586498, + "grad_norm": 1.2167452573776245, + "learning_rate": 5.741705723357371e-05, + "loss": 0.5468931198120117, + "step": 7060 + }, + { + "epoch": 2.979746835443038, + "grad_norm": 1.2835358381271362, + "learning_rate": 5.739330745028113e-05, + "loss": 0.5421503782272339, + "step": 7062 + }, + { + "epoch": 2.9805907172995783, + "grad_norm": 1.230869174003601, + "learning_rate": 5.736955596149449e-05, + "loss": 0.5574424266815186, + "step": 7064 + }, + { + "epoch": 2.981434599156118, + "grad_norm": 1.1757540702819824, + "learning_rate": 5.7345802772692844e-05, + "loss": 0.5349726676940918, + "step": 7066 + }, + { + "epoch": 2.982278481012658, + "grad_norm": 1.2147842645645142, + "learning_rate": 5.732204788935558e-05, + "loss": 0.5739659667015076, + "step": 7068 + }, + { + "epoch": 2.9831223628691985, + "grad_norm": 1.1981799602508545, + "learning_rate": 5.729829131696247e-05, + "loss": 0.6001242995262146, + "step": 7070 + }, + { + "epoch": 2.9839662447257385, + "grad_norm": 1.0104349851608276, + "learning_rate": 5.7274533060993744e-05, + "loss": 0.5373315811157227, + "step": 7072 + }, + { + "epoch": 2.9848101265822784, + "grad_norm": 1.31861412525177, + "learning_rate": 5.725077312692994e-05, + "loss": 0.6236737370491028, + "step": 7074 + }, + { + "epoch": 2.985654008438819, + "grad_norm": 1.2060835361480713, + "learning_rate": 5.722701152025203e-05, + "loss": 0.5138278007507324, + "step": 7076 + }, + { + "epoch": 2.9864978902953587, + "grad_norm": 1.2231637239456177, + "learning_rate": 5.720324824644134e-05, + "loss": 0.5775829553604126, + "step": 7078 + }, + { + "epoch": 2.9873417721518987, + "grad_norm": 1.110559344291687, + "learning_rate": 5.717948331097965e-05, + "loss": 0.5619624853134155, + "step": 7080 + }, + { + "epoch": 2.988185654008439, + "grad_norm": 1.0486462116241455, + "learning_rate": 5.715571671934903e-05, + "loss": 0.5401903390884399, + "step": 7082 + }, + { + "epoch": 2.989029535864979, + "grad_norm": 1.7979792356491089, + "learning_rate": 5.713194847703201e-05, + "loss": 0.6185324192047119, + "step": 7084 + }, + { + "epoch": 2.989873417721519, + "grad_norm": 1.1270287036895752, + "learning_rate": 5.710817858951143e-05, + "loss": 0.5637381672859192, + "step": 7086 + }, + { + "epoch": 2.9907172995780593, + "grad_norm": 1.0734593868255615, + "learning_rate": 5.708440706227055e-05, + "loss": 0.5341202020645142, + "step": 7088 + }, + { + "epoch": 2.9915611814345993, + "grad_norm": 1.1479569673538208, + "learning_rate": 5.7060633900793035e-05, + "loss": 0.6088040471076965, + "step": 7090 + }, + { + "epoch": 2.992405063291139, + "grad_norm": 1.417993426322937, + "learning_rate": 5.703685911056288e-05, + "loss": 0.6260532736778259, + "step": 7092 + }, + { + "epoch": 2.9932489451476796, + "grad_norm": 1.0302354097366333, + "learning_rate": 5.701308269706449e-05, + "loss": 0.5241007804870605, + "step": 7094 + }, + { + "epoch": 2.9940928270042195, + "grad_norm": 1.0818110704421997, + "learning_rate": 5.6989304665782585e-05, + "loss": 0.5663899183273315, + "step": 7096 + }, + { + "epoch": 2.9949367088607595, + "grad_norm": 1.3382261991500854, + "learning_rate": 5.696552502220235e-05, + "loss": 0.6420456171035767, + "step": 7098 + }, + { + "epoch": 2.9957805907173, + "grad_norm": 1.0404452085494995, + "learning_rate": 5.6941743771809254e-05, + "loss": 0.6239140033721924, + "step": 7100 + }, + { + "epoch": 2.9957805907173, + "eval_loss": 0.6692973375320435, + "eval_runtime": 512.8985, + "eval_samples_per_second": 4.108, + "eval_steps_per_second": 4.108, + "step": 7100 + }, + { + "epoch": 2.99662447257384, + "grad_norm": 1.0349514484405518, + "learning_rate": 5.691796092008918e-05, + "loss": 0.5956323146820068, + "step": 7102 + }, + { + "epoch": 2.9974683544303797, + "grad_norm": 1.0786800384521484, + "learning_rate": 5.689417647252839e-05, + "loss": 0.5639365911483765, + "step": 7104 + }, + { + "epoch": 2.9983122362869197, + "grad_norm": 1.2075775861740112, + "learning_rate": 5.687039043461351e-05, + "loss": 0.5529769659042358, + "step": 7106 + }, + { + "epoch": 2.99915611814346, + "grad_norm": 1.2835887670516968, + "learning_rate": 5.6846602811831496e-05, + "loss": 0.5834671258926392, + "step": 7108 + }, + { + "epoch": 3.0, + "grad_norm": 1.3102463483810425, + "learning_rate": 5.682281360966969e-05, + "loss": 0.5820922255516052, + "step": 7110 + }, + { + "epoch": 3.00084388185654, + "grad_norm": 1.24532949924469, + "learning_rate": 5.679902283361582e-05, + "loss": 0.5958086252212524, + "step": 7112 + }, + { + "epoch": 3.0016877637130803, + "grad_norm": 1.0468344688415527, + "learning_rate": 5.677523048915798e-05, + "loss": 0.5267294645309448, + "step": 7114 + }, + { + "epoch": 3.0025316455696203, + "grad_norm": 1.2053340673446655, + "learning_rate": 5.675143658178458e-05, + "loss": 0.49180498719215393, + "step": 7116 + }, + { + "epoch": 3.00337552742616, + "grad_norm": 1.1861987113952637, + "learning_rate": 5.6727641116984406e-05, + "loss": 0.6163156032562256, + "step": 7118 + }, + { + "epoch": 3.0042194092827006, + "grad_norm": 0.9804314374923706, + "learning_rate": 5.670384410024665e-05, + "loss": 0.4780079424381256, + "step": 7120 + }, + { + "epoch": 3.0050632911392405, + "grad_norm": 1.148734450340271, + "learning_rate": 5.668004553706081e-05, + "loss": 0.4762009382247925, + "step": 7122 + }, + { + "epoch": 3.0059071729957805, + "grad_norm": 1.3817394971847534, + "learning_rate": 5.665624543291677e-05, + "loss": 0.5391061305999756, + "step": 7124 + }, + { + "epoch": 3.006751054852321, + "grad_norm": 1.2641339302062988, + "learning_rate": 5.663244379330471e-05, + "loss": 0.5118980407714844, + "step": 7126 + }, + { + "epoch": 3.007594936708861, + "grad_norm": 1.1882877349853516, + "learning_rate": 5.660864062371527e-05, + "loss": 0.5076818466186523, + "step": 7128 + }, + { + "epoch": 3.0084388185654007, + "grad_norm": 1.3996630907058716, + "learning_rate": 5.658483592963936e-05, + "loss": 0.5128282308578491, + "step": 7130 + }, + { + "epoch": 3.009282700421941, + "grad_norm": 1.4738327264785767, + "learning_rate": 5.6561029716568246e-05, + "loss": 0.5689603090286255, + "step": 7132 + }, + { + "epoch": 3.010126582278481, + "grad_norm": 1.2539118528366089, + "learning_rate": 5.6537221989993605e-05, + "loss": 0.537216067314148, + "step": 7134 + }, + { + "epoch": 3.010970464135021, + "grad_norm": 1.2467267513275146, + "learning_rate": 5.6513412755407394e-05, + "loss": 0.5913172960281372, + "step": 7136 + }, + { + "epoch": 3.0118143459915614, + "grad_norm": 1.232380986213684, + "learning_rate": 5.648960201830194e-05, + "loss": 0.535701334476471, + "step": 7138 + }, + { + "epoch": 3.0126582278481013, + "grad_norm": 1.2236435413360596, + "learning_rate": 5.6465789784169944e-05, + "loss": 0.5035087466239929, + "step": 7140 + }, + { + "epoch": 3.0135021097046413, + "grad_norm": 1.1154464483261108, + "learning_rate": 5.6441976058504444e-05, + "loss": 0.5219660401344299, + "step": 7142 + }, + { + "epoch": 3.014345991561181, + "grad_norm": 1.1690709590911865, + "learning_rate": 5.6418160846798765e-05, + "loss": 0.5170891880989075, + "step": 7144 + }, + { + "epoch": 3.0151898734177216, + "grad_norm": 1.3172271251678467, + "learning_rate": 5.639434415454663e-05, + "loss": 0.52115398645401, + "step": 7146 + }, + { + "epoch": 3.0160337552742615, + "grad_norm": 1.1508091688156128, + "learning_rate": 5.637052598724213e-05, + "loss": 0.49015527963638306, + "step": 7148 + }, + { + "epoch": 3.0168776371308015, + "grad_norm": 1.1777493953704834, + "learning_rate": 5.634670635037962e-05, + "loss": 0.5465641021728516, + "step": 7150 + }, + { + "epoch": 3.017721518987342, + "grad_norm": 1.2320231199264526, + "learning_rate": 5.632288524945385e-05, + "loss": 0.5174515843391418, + "step": 7152 + }, + { + "epoch": 3.018565400843882, + "grad_norm": 1.3233075141906738, + "learning_rate": 5.629906268995988e-05, + "loss": 0.521284818649292, + "step": 7154 + }, + { + "epoch": 3.0194092827004217, + "grad_norm": 1.1378387212753296, + "learning_rate": 5.6275238677393136e-05, + "loss": 0.4841000437736511, + "step": 7156 + }, + { + "epoch": 3.020253164556962, + "grad_norm": 1.4944018125534058, + "learning_rate": 5.6251413217249325e-05, + "loss": 0.5399911403656006, + "step": 7158 + }, + { + "epoch": 3.021097046413502, + "grad_norm": 1.3964036703109741, + "learning_rate": 5.622758631502457e-05, + "loss": 0.6075693368911743, + "step": 7160 + }, + { + "epoch": 3.021940928270042, + "grad_norm": 1.2494895458221436, + "learning_rate": 5.6203757976215244e-05, + "loss": 0.4700590968132019, + "step": 7162 + }, + { + "epoch": 3.0227848101265824, + "grad_norm": 1.2082068920135498, + "learning_rate": 5.617992820631809e-05, + "loss": 0.46371224522590637, + "step": 7164 + }, + { + "epoch": 3.0236286919831223, + "grad_norm": 1.2820552587509155, + "learning_rate": 5.61560970108302e-05, + "loss": 0.6175356507301331, + "step": 7166 + }, + { + "epoch": 3.0244725738396623, + "grad_norm": 1.243906855583191, + "learning_rate": 5.613226439524896e-05, + "loss": 0.5443550944328308, + "step": 7168 + }, + { + "epoch": 3.0253164556962027, + "grad_norm": 1.2818046808242798, + "learning_rate": 5.6108430365072097e-05, + "loss": 0.540513277053833, + "step": 7170 + }, + { + "epoch": 3.0261603375527426, + "grad_norm": 1.2159545421600342, + "learning_rate": 5.608459492579765e-05, + "loss": 0.47928962111473083, + "step": 7172 + }, + { + "epoch": 3.0270042194092825, + "grad_norm": 1.2186859846115112, + "learning_rate": 5.606075808292401e-05, + "loss": 0.572704553604126, + "step": 7174 + }, + { + "epoch": 3.027848101265823, + "grad_norm": 1.0899910926818848, + "learning_rate": 5.60369198419499e-05, + "loss": 0.5537641048431396, + "step": 7176 + }, + { + "epoch": 3.028691983122363, + "grad_norm": 1.1885626316070557, + "learning_rate": 5.601308020837431e-05, + "loss": 0.5430077910423279, + "step": 7178 + }, + { + "epoch": 3.029535864978903, + "grad_norm": 1.3681434392929077, + "learning_rate": 5.5989239187696595e-05, + "loss": 0.5838874578475952, + "step": 7180 + }, + { + "epoch": 3.030379746835443, + "grad_norm": 1.4902375936508179, + "learning_rate": 5.596539678541644e-05, + "loss": 0.5168817639350891, + "step": 7182 + }, + { + "epoch": 3.031223628691983, + "grad_norm": 1.4395933151245117, + "learning_rate": 5.59415530070338e-05, + "loss": 0.5464931726455688, + "step": 7184 + }, + { + "epoch": 3.032067510548523, + "grad_norm": 1.2699668407440186, + "learning_rate": 5.5917707858049e-05, + "loss": 0.5364856123924255, + "step": 7186 + }, + { + "epoch": 3.0329113924050635, + "grad_norm": 1.1673169136047363, + "learning_rate": 5.589386134396264e-05, + "loss": 0.5676021575927734, + "step": 7188 + }, + { + "epoch": 3.0337552742616034, + "grad_norm": 1.2029050588607788, + "learning_rate": 5.5870013470275675e-05, + "loss": 0.5174224972724915, + "step": 7190 + }, + { + "epoch": 3.0345991561181433, + "grad_norm": 1.2046477794647217, + "learning_rate": 5.5846164242489326e-05, + "loss": 0.5298268795013428, + "step": 7192 + }, + { + "epoch": 3.0354430379746837, + "grad_norm": 1.2438830137252808, + "learning_rate": 5.582231366610516e-05, + "loss": 0.5120787024497986, + "step": 7194 + }, + { + "epoch": 3.0362869198312237, + "grad_norm": 1.1918164491653442, + "learning_rate": 5.579846174662506e-05, + "loss": 0.4706324338912964, + "step": 7196 + }, + { + "epoch": 3.0371308016877636, + "grad_norm": 1.125056266784668, + "learning_rate": 5.57746084895512e-05, + "loss": 0.5319511294364929, + "step": 7198 + }, + { + "epoch": 3.037974683544304, + "grad_norm": 1.3552099466323853, + "learning_rate": 5.575075390038607e-05, + "loss": 0.5893887877464294, + "step": 7200 + }, + { + "epoch": 3.037974683544304, + "eval_loss": 0.6751418709754944, + "eval_runtime": 513.8972, + "eval_samples_per_second": 4.1, + "eval_steps_per_second": 4.1, + "step": 7200 + }, + { + "epoch": 3.038818565400844, + "grad_norm": 1.3924046754837036, + "learning_rate": 5.572689798463243e-05, + "loss": 0.5680004358291626, + "step": 7202 + }, + { + "epoch": 3.039662447257384, + "grad_norm": 1.3154771327972412, + "learning_rate": 5.5703040747793444e-05, + "loss": 0.5572541356086731, + "step": 7204 + }, + { + "epoch": 3.0405063291139243, + "grad_norm": 1.2266511917114258, + "learning_rate": 5.567918219537247e-05, + "loss": 0.535094141960144, + "step": 7206 + }, + { + "epoch": 3.041350210970464, + "grad_norm": 1.2234530448913574, + "learning_rate": 5.565532233287324e-05, + "loss": 0.5958529710769653, + "step": 7208 + }, + { + "epoch": 3.042194092827004, + "grad_norm": 1.2451010942459106, + "learning_rate": 5.563146116579977e-05, + "loss": 0.5555807948112488, + "step": 7210 + }, + { + "epoch": 3.043037974683544, + "grad_norm": 1.518996000289917, + "learning_rate": 5.560759869965635e-05, + "loss": 0.5391029715538025, + "step": 7212 + }, + { + "epoch": 3.0438818565400845, + "grad_norm": 1.4555507898330688, + "learning_rate": 5.5583734939947604e-05, + "loss": 0.6110212802886963, + "step": 7214 + }, + { + "epoch": 3.0447257383966244, + "grad_norm": 1.1732209920883179, + "learning_rate": 5.555986989217844e-05, + "loss": 0.4841096103191376, + "step": 7216 + }, + { + "epoch": 3.0455696202531644, + "grad_norm": 1.3211549520492554, + "learning_rate": 5.55360035618541e-05, + "loss": 0.5234199166297913, + "step": 7218 + }, + { + "epoch": 3.0464135021097047, + "grad_norm": 1.0290759801864624, + "learning_rate": 5.551213595448003e-05, + "loss": 0.5311322808265686, + "step": 7220 + }, + { + "epoch": 3.0472573839662447, + "grad_norm": 1.3045908212661743, + "learning_rate": 5.548826707556206e-05, + "loss": 0.5279681086540222, + "step": 7222 + }, + { + "epoch": 3.0481012658227846, + "grad_norm": 1.039219617843628, + "learning_rate": 5.54643969306063e-05, + "loss": 0.47327345609664917, + "step": 7224 + }, + { + "epoch": 3.048945147679325, + "grad_norm": 1.5341938734054565, + "learning_rate": 5.544052552511909e-05, + "loss": 0.5803293585777283, + "step": 7226 + }, + { + "epoch": 3.049789029535865, + "grad_norm": 1.24624502658844, + "learning_rate": 5.5416652864607156e-05, + "loss": 0.5452714562416077, + "step": 7228 + }, + { + "epoch": 3.050632911392405, + "grad_norm": 1.192566156387329, + "learning_rate": 5.5392778954577416e-05, + "loss": 0.48333147168159485, + "step": 7230 + }, + { + "epoch": 3.0514767932489453, + "grad_norm": 1.3091192245483398, + "learning_rate": 5.536890380053715e-05, + "loss": 0.4947234094142914, + "step": 7232 + }, + { + "epoch": 3.052320675105485, + "grad_norm": 1.171740174293518, + "learning_rate": 5.534502740799388e-05, + "loss": 0.5226179361343384, + "step": 7234 + }, + { + "epoch": 3.053164556962025, + "grad_norm": 1.1677600145339966, + "learning_rate": 5.532114978245544e-05, + "loss": 0.490182101726532, + "step": 7236 + }, + { + "epoch": 3.0540084388185655, + "grad_norm": 1.2062798738479614, + "learning_rate": 5.529727092942994e-05, + "loss": 0.5542705655097961, + "step": 7238 + }, + { + "epoch": 3.0548523206751055, + "grad_norm": 1.2385777235031128, + "learning_rate": 5.5273390854425774e-05, + "loss": 0.5947107076644897, + "step": 7240 + }, + { + "epoch": 3.0556962025316454, + "grad_norm": 1.39088773727417, + "learning_rate": 5.524950956295162e-05, + "loss": 0.5728942155838013, + "step": 7242 + }, + { + "epoch": 3.056540084388186, + "grad_norm": 1.3944119215011597, + "learning_rate": 5.522562706051643e-05, + "loss": 0.5572934150695801, + "step": 7244 + }, + { + "epoch": 3.0573839662447257, + "grad_norm": 1.3647639751434326, + "learning_rate": 5.520174335262944e-05, + "loss": 0.6066661477088928, + "step": 7246 + }, + { + "epoch": 3.0582278481012657, + "grad_norm": 1.604581356048584, + "learning_rate": 5.5177858444800146e-05, + "loss": 0.5683084726333618, + "step": 7248 + }, + { + "epoch": 3.059071729957806, + "grad_norm": 1.280266284942627, + "learning_rate": 5.515397234253836e-05, + "loss": 0.5676847100257874, + "step": 7250 + }, + { + "epoch": 3.059915611814346, + "grad_norm": 1.1750892400741577, + "learning_rate": 5.513008505135414e-05, + "loss": 0.48119837045669556, + "step": 7252 + }, + { + "epoch": 3.060759493670886, + "grad_norm": 1.307988166809082, + "learning_rate": 5.510619657675783e-05, + "loss": 0.5646002888679504, + "step": 7254 + }, + { + "epoch": 3.0616033755274263, + "grad_norm": 1.2408503293991089, + "learning_rate": 5.508230692426002e-05, + "loss": 0.5276528596878052, + "step": 7256 + }, + { + "epoch": 3.0624472573839663, + "grad_norm": 1.2521553039550781, + "learning_rate": 5.505841609937161e-05, + "loss": 0.5539094805717468, + "step": 7258 + }, + { + "epoch": 3.0632911392405062, + "grad_norm": 1.387758493423462, + "learning_rate": 5.503452410760377e-05, + "loss": 0.5842460989952087, + "step": 7260 + }, + { + "epoch": 3.0641350210970466, + "grad_norm": 1.300126552581787, + "learning_rate": 5.501063095446789e-05, + "loss": 0.5075781345367432, + "step": 7262 + }, + { + "epoch": 3.0649789029535865, + "grad_norm": 1.3773088455200195, + "learning_rate": 5.498673664547569e-05, + "loss": 0.5713207721710205, + "step": 7264 + }, + { + "epoch": 3.0658227848101265, + "grad_norm": 1.3680146932601929, + "learning_rate": 5.496284118613912e-05, + "loss": 0.523406982421875, + "step": 7266 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 1.1380960941314697, + "learning_rate": 5.493894458197041e-05, + "loss": 0.5297801494598389, + "step": 7268 + }, + { + "epoch": 3.067510548523207, + "grad_norm": 1.4078724384307861, + "learning_rate": 5.491504683848202e-05, + "loss": 0.5217325091362, + "step": 7270 + }, + { + "epoch": 3.0683544303797468, + "grad_norm": 1.2392537593841553, + "learning_rate": 5.489114796118674e-05, + "loss": 0.5451233386993408, + "step": 7272 + }, + { + "epoch": 3.0691983122362867, + "grad_norm": 1.159034013748169, + "learning_rate": 5.4867247955597544e-05, + "loss": 0.5403155088424683, + "step": 7274 + }, + { + "epoch": 3.070042194092827, + "grad_norm": 1.1931780576705933, + "learning_rate": 5.484334682722773e-05, + "loss": 0.5579524636268616, + "step": 7276 + }, + { + "epoch": 3.070886075949367, + "grad_norm": 1.1836986541748047, + "learning_rate": 5.4819444581590805e-05, + "loss": 0.570443868637085, + "step": 7278 + }, + { + "epoch": 3.071729957805907, + "grad_norm": 1.2491910457611084, + "learning_rate": 5.4795541224200595e-05, + "loss": 0.5276142358779907, + "step": 7280 + }, + { + "epoch": 3.0725738396624473, + "grad_norm": 1.1931475400924683, + "learning_rate": 5.477163676057112e-05, + "loss": 0.48629680275917053, + "step": 7282 + }, + { + "epoch": 3.0734177215189873, + "grad_norm": 1.2027641534805298, + "learning_rate": 5.4747731196216676e-05, + "loss": 0.5148687958717346, + "step": 7284 + }, + { + "epoch": 3.0742616033755272, + "grad_norm": 1.4708147048950195, + "learning_rate": 5.4723824536651844e-05, + "loss": 0.5084875226020813, + "step": 7286 + }, + { + "epoch": 3.0751054852320676, + "grad_norm": 1.2080403566360474, + "learning_rate": 5.4699916787391404e-05, + "loss": 0.5537340641021729, + "step": 7288 + }, + { + "epoch": 3.0759493670886076, + "grad_norm": 1.1593934297561646, + "learning_rate": 5.467600795395043e-05, + "loss": 0.5617695450782776, + "step": 7290 + }, + { + "epoch": 3.0767932489451475, + "grad_norm": 1.2356617450714111, + "learning_rate": 5.465209804184421e-05, + "loss": 0.5376757383346558, + "step": 7292 + }, + { + "epoch": 3.077637130801688, + "grad_norm": 1.1403000354766846, + "learning_rate": 5.4628187056588344e-05, + "loss": 0.5002268552780151, + "step": 7294 + }, + { + "epoch": 3.078481012658228, + "grad_norm": 1.4888559579849243, + "learning_rate": 5.460427500369858e-05, + "loss": 0.6053714752197266, + "step": 7296 + }, + { + "epoch": 3.0793248945147678, + "grad_norm": 1.4204037189483643, + "learning_rate": 5.4580361888691e-05, + "loss": 0.6156546473503113, + "step": 7298 + }, + { + "epoch": 3.080168776371308, + "grad_norm": 1.9313244819641113, + "learning_rate": 5.4556447717081925e-05, + "loss": 0.5875506401062012, + "step": 7300 + }, + { + "epoch": 3.080168776371308, + "eval_loss": 0.678839385509491, + "eval_runtime": 513.7013, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 7300 + }, + { + "epoch": 3.081012658227848, + "grad_norm": 1.1877124309539795, + "learning_rate": 5.453253249438786e-05, + "loss": 0.48908641934394836, + "step": 7302 + }, + { + "epoch": 3.081856540084388, + "grad_norm": 1.308233380317688, + "learning_rate": 5.4508616226125595e-05, + "loss": 0.5307457447052002, + "step": 7304 + }, + { + "epoch": 3.0827004219409284, + "grad_norm": 1.3067306280136108, + "learning_rate": 5.4484698917812164e-05, + "loss": 0.48060229420661926, + "step": 7306 + }, + { + "epoch": 3.0835443037974684, + "grad_norm": 1.3354034423828125, + "learning_rate": 5.446078057496481e-05, + "loss": 0.5300682187080383, + "step": 7308 + }, + { + "epoch": 3.0843881856540083, + "grad_norm": 1.1963045597076416, + "learning_rate": 5.443686120310105e-05, + "loss": 0.47262853384017944, + "step": 7310 + }, + { + "epoch": 3.0852320675105487, + "grad_norm": 1.352649450302124, + "learning_rate": 5.441294080773863e-05, + "loss": 0.5363158583641052, + "step": 7312 + }, + { + "epoch": 3.0860759493670886, + "grad_norm": 1.415164828300476, + "learning_rate": 5.438901939439551e-05, + "loss": 0.5516205430030823, + "step": 7314 + }, + { + "epoch": 3.0869198312236286, + "grad_norm": 1.2061728239059448, + "learning_rate": 5.436509696858992e-05, + "loss": 0.5458099246025085, + "step": 7316 + }, + { + "epoch": 3.087763713080169, + "grad_norm": 1.2327239513397217, + "learning_rate": 5.434117353584027e-05, + "loss": 0.5184649229049683, + "step": 7318 + }, + { + "epoch": 3.088607594936709, + "grad_norm": 1.0882518291473389, + "learning_rate": 5.431724910166528e-05, + "loss": 0.46032577753067017, + "step": 7320 + }, + { + "epoch": 3.089451476793249, + "grad_norm": 1.2710907459259033, + "learning_rate": 5.429332367158384e-05, + "loss": 0.5696587562561035, + "step": 7322 + }, + { + "epoch": 3.090295358649789, + "grad_norm": 1.5157700777053833, + "learning_rate": 5.4269397251115065e-05, + "loss": 0.59807288646698, + "step": 7324 + }, + { + "epoch": 3.091139240506329, + "grad_norm": 1.2869718074798584, + "learning_rate": 5.424546984577835e-05, + "loss": 0.5135430693626404, + "step": 7326 + }, + { + "epoch": 3.091983122362869, + "grad_norm": 1.19942045211792, + "learning_rate": 5.4221541461093276e-05, + "loss": 0.47552013397216797, + "step": 7328 + }, + { + "epoch": 3.0928270042194095, + "grad_norm": 1.4979162216186523, + "learning_rate": 5.4197612102579665e-05, + "loss": 0.5549390316009521, + "step": 7330 + }, + { + "epoch": 3.0936708860759494, + "grad_norm": 1.3181121349334717, + "learning_rate": 5.4173681775757545e-05, + "loss": 0.4878964126110077, + "step": 7332 + }, + { + "epoch": 3.0945147679324894, + "grad_norm": 1.740233063697815, + "learning_rate": 5.414975048614722e-05, + "loss": 0.526251494884491, + "step": 7334 + }, + { + "epoch": 3.0953586497890297, + "grad_norm": 1.2123478651046753, + "learning_rate": 5.412581823926914e-05, + "loss": 0.48297834396362305, + "step": 7336 + }, + { + "epoch": 3.0962025316455697, + "grad_norm": 1.2853679656982422, + "learning_rate": 5.410188504064403e-05, + "loss": 0.5184051990509033, + "step": 7338 + }, + { + "epoch": 3.0970464135021096, + "grad_norm": 1.2580705881118774, + "learning_rate": 5.4077950895792815e-05, + "loss": 0.5476894974708557, + "step": 7340 + }, + { + "epoch": 3.09789029535865, + "grad_norm": 1.3363854885101318, + "learning_rate": 5.4054015810236666e-05, + "loss": 0.5379365682601929, + "step": 7342 + }, + { + "epoch": 3.09873417721519, + "grad_norm": 1.3067597150802612, + "learning_rate": 5.4030079789496925e-05, + "loss": 0.5325208306312561, + "step": 7344 + }, + { + "epoch": 3.09957805907173, + "grad_norm": 1.3179864883422852, + "learning_rate": 5.400614283909515e-05, + "loss": 0.56773442029953, + "step": 7346 + }, + { + "epoch": 3.10042194092827, + "grad_norm": 1.2006254196166992, + "learning_rate": 5.3982204964553196e-05, + "loss": 0.542707622051239, + "step": 7348 + }, + { + "epoch": 3.1012658227848102, + "grad_norm": 1.2013983726501465, + "learning_rate": 5.395826617139301e-05, + "loss": 0.5579875111579895, + "step": 7350 + }, + { + "epoch": 3.10210970464135, + "grad_norm": 1.2002209424972534, + "learning_rate": 5.3934326465136854e-05, + "loss": 0.5369019508361816, + "step": 7352 + }, + { + "epoch": 3.10295358649789, + "grad_norm": 1.1660926342010498, + "learning_rate": 5.3910385851307133e-05, + "loss": 0.5573506355285645, + "step": 7354 + }, + { + "epoch": 3.1037974683544305, + "grad_norm": 1.3189473152160645, + "learning_rate": 5.38864443354265e-05, + "loss": 0.5485683679580688, + "step": 7356 + }, + { + "epoch": 3.1046413502109704, + "grad_norm": 1.11967134475708, + "learning_rate": 5.38625019230178e-05, + "loss": 0.4980843663215637, + "step": 7358 + }, + { + "epoch": 3.1054852320675104, + "grad_norm": 1.429019570350647, + "learning_rate": 5.3838558619604074e-05, + "loss": 0.5331753492355347, + "step": 7360 + }, + { + "epoch": 3.1063291139240508, + "grad_norm": 1.2600942850112915, + "learning_rate": 5.381461443070862e-05, + "loss": 0.5362547636032104, + "step": 7362 + }, + { + "epoch": 3.1071729957805907, + "grad_norm": 1.6344311237335205, + "learning_rate": 5.379066936185486e-05, + "loss": 0.5793240070343018, + "step": 7364 + }, + { + "epoch": 3.1080168776371306, + "grad_norm": 1.4372280836105347, + "learning_rate": 5.376672341856649e-05, + "loss": 0.5316762328147888, + "step": 7366 + }, + { + "epoch": 3.108860759493671, + "grad_norm": 1.4075509309768677, + "learning_rate": 5.3742776606367364e-05, + "loss": 0.5305402874946594, + "step": 7368 + }, + { + "epoch": 3.109704641350211, + "grad_norm": 1.6254384517669678, + "learning_rate": 5.371882893078156e-05, + "loss": 0.5756345391273499, + "step": 7370 + }, + { + "epoch": 3.110548523206751, + "grad_norm": 1.2218619585037231, + "learning_rate": 5.3694880397333335e-05, + "loss": 0.4959688186645508, + "step": 7372 + }, + { + "epoch": 3.1113924050632913, + "grad_norm": 1.3503917455673218, + "learning_rate": 5.3670931011547166e-05, + "loss": 0.593587338924408, + "step": 7374 + }, + { + "epoch": 3.1122362869198312, + "grad_norm": 1.403222918510437, + "learning_rate": 5.364698077894772e-05, + "loss": 0.5407475233078003, + "step": 7376 + }, + { + "epoch": 3.113080168776371, + "grad_norm": 1.4017539024353027, + "learning_rate": 5.3623029705059835e-05, + "loss": 0.6125431060791016, + "step": 7378 + }, + { + "epoch": 3.1139240506329116, + "grad_norm": 1.4538600444793701, + "learning_rate": 5.359907779540859e-05, + "loss": 0.5179317593574524, + "step": 7380 + }, + { + "epoch": 3.1147679324894515, + "grad_norm": 1.2120319604873657, + "learning_rate": 5.3575125055519225e-05, + "loss": 0.43457767367362976, + "step": 7382 + }, + { + "epoch": 3.1156118143459914, + "grad_norm": 1.3049911260604858, + "learning_rate": 5.355117149091717e-05, + "loss": 0.5810346603393555, + "step": 7384 + }, + { + "epoch": 3.116455696202532, + "grad_norm": 1.1788939237594604, + "learning_rate": 5.3527217107128036e-05, + "loss": 0.4865732789039612, + "step": 7386 + }, + { + "epoch": 3.1172995780590718, + "grad_norm": 1.4433233737945557, + "learning_rate": 5.350326190967768e-05, + "loss": 0.5363003015518188, + "step": 7388 + }, + { + "epoch": 3.1181434599156117, + "grad_norm": 1.2610430717468262, + "learning_rate": 5.347930590409207e-05, + "loss": 0.5111554861068726, + "step": 7390 + }, + { + "epoch": 3.118987341772152, + "grad_norm": 1.1659626960754395, + "learning_rate": 5.345534909589742e-05, + "loss": 0.5018916726112366, + "step": 7392 + }, + { + "epoch": 3.119831223628692, + "grad_norm": 1.1380181312561035, + "learning_rate": 5.343139149062008e-05, + "loss": 0.5640519261360168, + "step": 7394 + }, + { + "epoch": 3.120675105485232, + "grad_norm": 2.249542713165283, + "learning_rate": 5.340743309378663e-05, + "loss": 0.5245673060417175, + "step": 7396 + }, + { + "epoch": 3.1215189873417724, + "grad_norm": 1.288784384727478, + "learning_rate": 5.338347391092381e-05, + "loss": 0.5807012915611267, + "step": 7398 + }, + { + "epoch": 3.1223628691983123, + "grad_norm": 1.3856520652770996, + "learning_rate": 5.3359513947558525e-05, + "loss": 0.5531487464904785, + "step": 7400 + }, + { + "epoch": 3.1223628691983123, + "eval_loss": 0.676459550857544, + "eval_runtime": 513.5901, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 4.102, + "step": 7400 + }, + { + "epoch": 3.1232067510548522, + "grad_norm": 1.4650845527648926, + "learning_rate": 5.333555320921791e-05, + "loss": 0.610917329788208, + "step": 7402 + }, + { + "epoch": 3.124050632911392, + "grad_norm": 1.4528120756149292, + "learning_rate": 5.331159170142923e-05, + "loss": 0.5132687091827393, + "step": 7404 + }, + { + "epoch": 3.1248945147679326, + "grad_norm": 1.297371745109558, + "learning_rate": 5.328762942971994e-05, + "loss": 0.5424297451972961, + "step": 7406 + }, + { + "epoch": 3.1257383966244725, + "grad_norm": 1.3470855951309204, + "learning_rate": 5.326366639961767e-05, + "loss": 0.5797439813613892, + "step": 7408 + }, + { + "epoch": 3.1265822784810124, + "grad_norm": 1.0487306118011475, + "learning_rate": 5.323970261665027e-05, + "loss": 0.45681023597717285, + "step": 7410 + }, + { + "epoch": 3.127426160337553, + "grad_norm": 1.1351137161254883, + "learning_rate": 5.321573808634567e-05, + "loss": 0.5057253241539001, + "step": 7412 + }, + { + "epoch": 3.1282700421940928, + "grad_norm": 1.3002208471298218, + "learning_rate": 5.3191772814232055e-05, + "loss": 0.55838543176651, + "step": 7414 + }, + { + "epoch": 3.1291139240506327, + "grad_norm": 1.3143419027328491, + "learning_rate": 5.316780680583776e-05, + "loss": 0.5052227973937988, + "step": 7416 + }, + { + "epoch": 3.129957805907173, + "grad_norm": 1.2752583026885986, + "learning_rate": 5.314384006669126e-05, + "loss": 0.5119181871414185, + "step": 7418 + }, + { + "epoch": 3.130801687763713, + "grad_norm": 1.2892590761184692, + "learning_rate": 5.3119872602321256e-05, + "loss": 0.5696089267730713, + "step": 7420 + }, + { + "epoch": 3.131645569620253, + "grad_norm": 1.510764718055725, + "learning_rate": 5.309590441825654e-05, + "loss": 0.6057182550430298, + "step": 7422 + }, + { + "epoch": 3.1324894514767934, + "grad_norm": 1.2366914749145508, + "learning_rate": 5.307193552002616e-05, + "loss": 0.5079684853553772, + "step": 7424 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 1.2063475847244263, + "learning_rate": 5.3047965913159226e-05, + "loss": 0.4977130591869354, + "step": 7426 + }, + { + "epoch": 3.1341772151898732, + "grad_norm": 1.603097677230835, + "learning_rate": 5.30239956031851e-05, + "loss": 0.5761610865592957, + "step": 7428 + }, + { + "epoch": 3.1350210970464136, + "grad_norm": 1.2723357677459717, + "learning_rate": 5.300002459563328e-05, + "loss": 0.5743051767349243, + "step": 7430 + }, + { + "epoch": 3.1358649789029536, + "grad_norm": 1.3077106475830078, + "learning_rate": 5.297605289603338e-05, + "loss": 0.5411891341209412, + "step": 7432 + }, + { + "epoch": 3.1367088607594935, + "grad_norm": 1.3610905408859253, + "learning_rate": 5.2952080509915246e-05, + "loss": 0.5488677620887756, + "step": 7434 + }, + { + "epoch": 3.137552742616034, + "grad_norm": 1.1999255418777466, + "learning_rate": 5.292810744280884e-05, + "loss": 0.554864227771759, + "step": 7436 + }, + { + "epoch": 3.138396624472574, + "grad_norm": 1.2868118286132812, + "learning_rate": 5.2904133700244276e-05, + "loss": 0.49844983220100403, + "step": 7438 + }, + { + "epoch": 3.1392405063291138, + "grad_norm": 1.3824434280395508, + "learning_rate": 5.288015928775183e-05, + "loss": 0.5171698331832886, + "step": 7440 + }, + { + "epoch": 3.140084388185654, + "grad_norm": 1.502249002456665, + "learning_rate": 5.285618421086197e-05, + "loss": 0.550440788269043, + "step": 7442 + }, + { + "epoch": 3.140928270042194, + "grad_norm": 1.2650765180587769, + "learning_rate": 5.283220847510526e-05, + "loss": 0.5033495426177979, + "step": 7444 + }, + { + "epoch": 3.141772151898734, + "grad_norm": 1.2669732570648193, + "learning_rate": 5.280823208601244e-05, + "loss": 0.48968273401260376, + "step": 7446 + }, + { + "epoch": 3.1426160337552744, + "grad_norm": 1.104645848274231, + "learning_rate": 5.278425504911442e-05, + "loss": 0.4713798463344574, + "step": 7448 + }, + { + "epoch": 3.1434599156118144, + "grad_norm": 1.2858284711837769, + "learning_rate": 5.276027736994224e-05, + "loss": 0.5249105095863342, + "step": 7450 + }, + { + "epoch": 3.1443037974683543, + "grad_norm": 1.3720128536224365, + "learning_rate": 5.2736299054027064e-05, + "loss": 0.5125989317893982, + "step": 7452 + }, + { + "epoch": 3.1451476793248947, + "grad_norm": 1.2519328594207764, + "learning_rate": 5.271232010690025e-05, + "loss": 0.5324952006340027, + "step": 7454 + }, + { + "epoch": 3.1459915611814346, + "grad_norm": 1.2284791469573975, + "learning_rate": 5.2688340534093295e-05, + "loss": 0.4961182475090027, + "step": 7456 + }, + { + "epoch": 3.1468354430379746, + "grad_norm": 1.428916335105896, + "learning_rate": 5.26643603411378e-05, + "loss": 0.5569467544555664, + "step": 7458 + }, + { + "epoch": 3.147679324894515, + "grad_norm": 1.172302007675171, + "learning_rate": 5.264037953356554e-05, + "loss": 0.476906418800354, + "step": 7460 + }, + { + "epoch": 3.148523206751055, + "grad_norm": 1.2087178230285645, + "learning_rate": 5.261639811690843e-05, + "loss": 0.5321967601776123, + "step": 7462 + }, + { + "epoch": 3.149367088607595, + "grad_norm": 1.1226983070373535, + "learning_rate": 5.259241609669854e-05, + "loss": 0.5333749651908875, + "step": 7464 + }, + { + "epoch": 3.1502109704641352, + "grad_norm": 1.156534194946289, + "learning_rate": 5.256843347846803e-05, + "loss": 0.5035849809646606, + "step": 7466 + }, + { + "epoch": 3.151054852320675, + "grad_norm": 1.3600608110427856, + "learning_rate": 5.2544450267749244e-05, + "loss": 0.4934900104999542, + "step": 7468 + }, + { + "epoch": 3.151898734177215, + "grad_norm": 1.2820971012115479, + "learning_rate": 5.252046647007465e-05, + "loss": 0.5037409067153931, + "step": 7470 + }, + { + "epoch": 3.1527426160337555, + "grad_norm": 1.1549314260482788, + "learning_rate": 5.249648209097685e-05, + "loss": 0.5056651830673218, + "step": 7472 + }, + { + "epoch": 3.1535864978902954, + "grad_norm": 1.1724461317062378, + "learning_rate": 5.2472497135988586e-05, + "loss": 0.4975930154323578, + "step": 7474 + }, + { + "epoch": 3.1544303797468354, + "grad_norm": 1.1598713397979736, + "learning_rate": 5.2448511610642695e-05, + "loss": 0.46283629536628723, + "step": 7476 + }, + { + "epoch": 3.1552742616033758, + "grad_norm": 1.44228196144104, + "learning_rate": 5.2424525520472236e-05, + "loss": 0.557330846786499, + "step": 7478 + }, + { + "epoch": 3.1561181434599157, + "grad_norm": 1.3199583292007446, + "learning_rate": 5.2400538871010266e-05, + "loss": 0.513535737991333, + "step": 7480 + }, + { + "epoch": 3.1569620253164556, + "grad_norm": 1.180692434310913, + "learning_rate": 5.23765516677901e-05, + "loss": 0.5200037956237793, + "step": 7482 + }, + { + "epoch": 3.1578059071729956, + "grad_norm": 1.4217020273208618, + "learning_rate": 5.23525639163451e-05, + "loss": 0.576216459274292, + "step": 7484 + }, + { + "epoch": 3.158649789029536, + "grad_norm": 1.238783359527588, + "learning_rate": 5.23285756222088e-05, + "loss": 0.555095374584198, + "step": 7486 + }, + { + "epoch": 3.159493670886076, + "grad_norm": 1.293283462524414, + "learning_rate": 5.2304586790914815e-05, + "loss": 0.5228440761566162, + "step": 7488 + }, + { + "epoch": 3.160337552742616, + "grad_norm": 1.373578429222107, + "learning_rate": 5.22805974279969e-05, + "loss": 0.5684541463851929, + "step": 7490 + }, + { + "epoch": 3.1611814345991562, + "grad_norm": 1.1387807130813599, + "learning_rate": 5.225660753898899e-05, + "loss": 0.4627608358860016, + "step": 7492 + }, + { + "epoch": 3.162025316455696, + "grad_norm": 1.1708600521087646, + "learning_rate": 5.223261712942504e-05, + "loss": 0.5046111345291138, + "step": 7494 + }, + { + "epoch": 3.162869198312236, + "grad_norm": 1.3370471000671387, + "learning_rate": 5.220862620483921e-05, + "loss": 0.5108349323272705, + "step": 7496 + }, + { + "epoch": 3.1637130801687765, + "grad_norm": 1.399530053138733, + "learning_rate": 5.2184634770765716e-05, + "loss": 0.525260329246521, + "step": 7498 + }, + { + "epoch": 3.1645569620253164, + "grad_norm": 1.4769412279129028, + "learning_rate": 5.216064283273896e-05, + "loss": 0.6050346493721008, + "step": 7500 + }, + { + "epoch": 3.1645569620253164, + "eval_loss": 0.6774632334709167, + "eval_runtime": 513.4064, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 4.104, + "step": 7500 + } + ], + "logging_steps": 2, + "max_steps": 14220, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 4 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.789010110485719e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/sft_devstral_24B_v2/checkpoints/checkpoint-7500/training_args.bin b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bcbb0c1830757458e5f1538c7e05857fe1a2bb5e --- /dev/null +++ b/sft_devstral_24B_v2/checkpoints/checkpoint-7500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09df88fe57630482e911c5fab6026e3d20e4f37f6e48706f3566768f533d6d7 +size 4792 diff --git a/sft_devstral_24B_v2/config_resolved.yaml b/sft_devstral_24B_v2/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd390b9b0c768b8ac88c17f977cf8e879795551d --- /dev/null +++ b/sft_devstral_24B_v2/config_resolved.yaml @@ -0,0 +1,102 @@ +run: + run_dir: ./task2file/sft_devstral_24B_v2 + seed: 42 +wandb: + enabled: true + project: sft-training + entity: null + name: null + tags: + - sft-lora + - 24b-Devstral + notes: null +model: + repo_id: ./Models/Devstral-Small-2-24B-HS-CPT + revision: null + base_local_dir: base_model + trust_remote_code: true + tokenizer_use_fast: true + device_map: auto + torch_dtype: bfloat16 + use_4bit: false + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + bnb_4bit_compute_dtype: bfloat16 + attn_implementation: null +data: + train_jsonl: sft_dataset.jsonl + eval_jsonl: null + eval_split_ratio: 0.1 + instruction_field: instruction + input_field: input + output_field: output + format_type: custom + system_prompt: "You are a Hyperswitch Rust code analyzer. Identify functions/structs\ + \ that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain\ + \ the data flow and why each component must change:\n- Flow: [Input \u2192 Processing\ + \ \u2192 Output with arrows]\n- For each component: \"The [ComponentName] ([path])\ + \ must [action] because [reason]\u2014without this, [consequence]\"\n- Explain\ + \ coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\n\ + add::crates/another/file.rs::function::AnotherComponent\n\n\n## Rules\n\n\ + 1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for\ + \ nested items: `status::StructName::Type::Name`\n3. Always explain \"must change\ + \ because\" and \"without this\"\n3. Types of components: function, struct, enum,\ + \ impl, trait\n4. If there is extra information (e.g., enum variants), include\ + \ that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with \n\ + \n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook\ + \ system routes events via EventClass enum. Flow: webhook \u2192 EventClass \u2192\ + \ handler \u2192 processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass)\ + \ must add Subscriptions variant because it defines event routing\u2014without\ + \ this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus)\ + \ must map to EventType because it converts status to events\u2014without this,\ + \ status changes don't trigger webhooks. These are coupled: EventClass routes\ + \ to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\n\ + crates/common_enums/src/transformers.rs::SubscriptionStatus\n\n" + custom_template: '##INSTRUCTION + + {instruction}<|im_end|> + + {input}<|im_end|> + + {output}<|im_end|>' + max_length: 2048 + shuffle: true + num_proc: 4 +peft: + enabled: true + r: 8 + lora_alpha: 16 + lora_dropout: 0.05 + bias: none + target_modules: auto +train: + num_train_epochs: 6 + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 8 + learning_rate: 1e-4 + weight_decay: 0.0 + warmup_ratio: 0.08 + lr_scheduler_type: cosine + optim: adamw_torch + max_grad_norm: 0.8 + gradient_checkpointing: true + logging_steps: 2 + save_strategy: steps + save_steps: 500 + save_total_limit: 20 + evaluation_strategy: steps + eval_steps: 100 + load_best_model_at_end: true + early_stopping: + enabled: true + patience: 5 + min_delta: 0.001 + metric: eval_loss + mode: min + resume_from_checkpoint: auto +merge: + enabled: true + merged_dtype: float16 + max_shard_size: 2GB + output_dir: ./Models/Devstral-Small-2-24B-HS-CPT-SFT_v2 diff --git a/sft_devstral_24B_v2/eval_final.json b/sft_devstral_24B_v2/eval_final.json new file mode 100644 index 0000000000000000000000000000000000000000..05712431ff936d6e517eec8635feb63e8533e862 --- /dev/null +++ b/sft_devstral_24B_v2/eval_final.json @@ -0,0 +1,8 @@ +{ + "eval_loss": 0.6706293225288391, + "eval_runtime": 511.6513, + "eval_samples_per_second": 4.118, + "eval_steps_per_second": 4.118, + "epoch": 3.2067510548523206, + "perplexity": 1.955467553274469 +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/logs/eval.jsonl b/sft_devstral_24B_v2/logs/eval.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a8e0902a437e61ebe1d98cc1b24032399d049e0c --- /dev/null +++ b/sft_devstral_24B_v2/logs/eval.jsonl @@ -0,0 +1,77 @@ +{"ts": "2025-12-26T18:34:59", "event": "eval", "step": 100, "epoch": 0.04219409282700422, "eval_loss": 1.138856053352356, "eval_runtime": 859.7128, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451, "perplexity": 3.1231935540832674} +{"ts": "2025-12-26T19:05:22", "event": "eval", "step": 200, "epoch": 0.08438818565400844, "eval_loss": 0.995743453502655, "eval_runtime": 846.8257, "eval_samples_per_second": 2.488, "eval_steps_per_second": 2.488, "perplexity": 2.7067359257317922} +{"ts": "2025-12-26T19:35:57", "event": "eval", "step": 300, "epoch": 0.12658227848101267, "eval_loss": 0.9517185688018799, "eval_runtime": 860.0287, "eval_samples_per_second": 2.45, "eval_steps_per_second": 2.45, "perplexity": 2.5901571998746475} +{"ts": "2025-12-26T20:06:52", "event": "eval", "step": 400, "epoch": 0.16877637130801687, "eval_loss": 0.9282881617546082, "eval_runtime": 869.6867, "eval_samples_per_second": 2.423, "eval_steps_per_second": 2.423, "perplexity": 2.5301742193066197} +{"ts": "2025-12-26T20:37:22", "event": "eval", "step": 500, "epoch": 0.2109704641350211, "eval_loss": 0.9080732464790344, "eval_runtime": 857.0753, "eval_samples_per_second": 2.458, "eval_steps_per_second": 2.458, "perplexity": 2.4795404646097325} +{"ts": "2025-12-26T21:07:55", "event": "eval", "step": 600, "epoch": 0.25316455696202533, "eval_loss": 0.8903881311416626, "eval_runtime": 845.9969, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491, "perplexity": 2.4360749843862655} +{"ts": "2025-12-26T21:38:29", "event": "eval", "step": 700, "epoch": 0.29535864978902954, "eval_loss": 0.8730722069740295, "eval_runtime": 858.184, "eval_samples_per_second": 2.455, "eval_steps_per_second": 2.455, "perplexity": 2.3942552136153896} +{"ts": "2025-12-26T22:09:04", "event": "eval", "step": 800, "epoch": 0.33755274261603374, "eval_loss": 0.8635594248771667, "eval_runtime": 865.9348, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "perplexity": 2.371587174483758} +{"ts": "2025-12-26T22:39:42", "event": "eval", "step": 900, "epoch": 0.379746835443038, "eval_loss": 0.8491304516792297, "eval_runtime": 852.6211, "eval_samples_per_second": 2.471, "eval_steps_per_second": 2.471, "perplexity": 2.3376133001985813} +{"ts": "2025-12-26T23:10:19", "event": "eval", "step": 1000, "epoch": 0.4219409282700422, "eval_loss": 0.8388314247131348, "eval_runtime": 847.4828, "eval_samples_per_second": 2.486, "eval_steps_per_second": 2.486, "perplexity": 2.3136617085393727} +{"ts": "2025-12-26T23:41:01", "event": "eval", "step": 1100, "epoch": 0.4641350210970464, "eval_loss": 0.8283821940422058, "eval_runtime": 861.0464, "eval_samples_per_second": 2.447, "eval_steps_per_second": 2.447, "perplexity": 2.2896115950724094} +{"ts": "2025-12-27T00:11:32", "event": "eval", "step": 1200, "epoch": 0.5063291139240507, "eval_loss": 0.8186545968055725, "eval_runtime": 862.1638, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444, "perplexity": 2.267447153803737} +{"ts": "2025-12-27T00:42:14", "event": "eval", "step": 1300, "epoch": 0.5485232067510548, "eval_loss": 0.808323085308075, "eval_runtime": 853.577, "eval_samples_per_second": 2.468, "eval_steps_per_second": 2.468, "perplexity": 2.244141595588398} +{"ts": "2025-12-27T01:12:54", "event": "eval", "step": 1400, "epoch": 0.5907172995780591, "eval_loss": 0.8009664416313171, "eval_runtime": 851.9417, "eval_samples_per_second": 2.473, "eval_steps_per_second": 2.473, "perplexity": 2.227692823570967} +{"ts": "2025-12-27T01:43:40", "event": "eval", "step": 1500, "epoch": 0.6329113924050633, "eval_loss": 0.7896141409873962, "eval_runtime": 865.9069, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "perplexity": 2.2025463898941693} +{"ts": "2025-12-27T02:14:07", "event": "eval", "step": 1600, "epoch": 0.6751054852320675, "eval_loss": 0.7836604714393616, "eval_runtime": 861.5352, "eval_samples_per_second": 2.446, "eval_steps_per_second": 2.446, "perplexity": 2.189472115099779} +{"ts": "2025-12-27T02:44:39", "event": "eval", "step": 1700, "epoch": 0.7172995780590717, "eval_loss": 0.7783148884773254, "eval_runtime": 846.1986, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "perplexity": 2.1777993369634507} +{"ts": "2025-12-27T03:15:22", "event": "eval", "step": 1800, "epoch": 0.759493670886076, "eval_loss": 0.7719914317131042, "eval_runtime": 853.1943, "eval_samples_per_second": 2.47, "eval_steps_per_second": 2.47, "perplexity": 2.16407156624064} +{"ts": "2025-12-27T03:45:59", "event": "eval", "step": 1900, "epoch": 0.8016877637130801, "eval_loss": 0.7648926973342896, "eval_runtime": 865.9394, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433, "perplexity": 2.148763794201393} +{"ts": "2025-12-27T04:16:30", "event": "eval", "step": 2000, "epoch": 0.8438818565400844, "eval_loss": 0.7587011456489563, "eval_runtime": 856.2276, "eval_samples_per_second": 2.461, "eval_steps_per_second": 2.461, "perplexity": 2.135500714003631} +{"ts": "2025-12-27T04:47:14", "event": "eval", "step": 2100, "epoch": 0.8860759493670886, "eval_loss": 0.7559094429016113, "eval_runtime": 847.8311, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "perplexity": 2.1295473446786564} +{"ts": "2025-12-27T05:17:56", "event": "eval", "step": 2200, "epoch": 0.9282700421940928, "eval_loss": 0.7497645616531372, "eval_runtime": 856.8766, "eval_samples_per_second": 2.459, "eval_steps_per_second": 2.459, "perplexity": 2.116501652297792} +{"ts": "2025-12-27T05:48:33", "event": "eval", "step": 2300, "epoch": 0.9704641350210971, "eval_loss": 0.7464568614959717, "eval_runtime": 864.2128, "eval_samples_per_second": 2.438, "eval_steps_per_second": 2.438, "perplexity": 2.1095124648903094} +{"ts": "2025-12-27T06:18:53", "event": "eval", "step": 2400, "epoch": 1.0126582278481013, "eval_loss": 0.7421699166297913, "eval_runtime": 854.2185, "eval_samples_per_second": 2.467, "eval_steps_per_second": 2.467, "perplexity": 2.100488457789446} +{"ts": "2025-12-27T06:49:31", "event": "eval", "step": 2500, "epoch": 1.0548523206751055, "eval_loss": 0.741338849067688, "eval_runtime": 847.7478, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "perplexity": 2.098743535142341} +{"ts": "2025-12-27T07:20:16", "event": "eval", "step": 2600, "epoch": 1.0970464135021096, "eval_loss": 0.7377332448959351, "eval_runtime": 859.6612, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451, "perplexity": 2.091189922548451} +{"ts": "2025-12-27T07:51:03", "event": "eval", "step": 2700, "epoch": 1.139240506329114, "eval_loss": 0.7335711717605591, "eval_runtime": 861.9651, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444, "perplexity": 2.0825043247357775} +{"ts": "2025-12-27T08:21:29", "event": "eval", "step": 2800, "epoch": 1.1814345991561181, "eval_loss": 0.7298192977905273, "eval_runtime": 849.544, "eval_samples_per_second": 2.48, "eval_steps_per_second": 2.48, "perplexity": 2.074705669900544} +{"ts": "2025-12-27T08:52:09", "event": "eval", "step": 2900, "epoch": 1.2236286919831223, "eval_loss": 0.7281573414802551, "eval_runtime": 854.563, "eval_samples_per_second": 2.466, "eval_steps_per_second": 2.466, "perplexity": 2.0712604634048333} +{"ts": "2025-12-27T09:23:05", "event": "eval", "step": 3000, "epoch": 1.2658227848101267, "eval_loss": 0.72515869140625, "eval_runtime": 868.0515, "eval_samples_per_second": 2.427, "eval_steps_per_second": 2.427, "perplexity": 2.0650587810476666} +{"ts": "2025-12-27T09:53:39", "event": "eval", "step": 3100, "epoch": 1.3080168776371308, "eval_loss": 0.7225774526596069, "eval_runtime": 862.4006, "eval_samples_per_second": 2.443, "eval_steps_per_second": 2.443, "perplexity": 2.0597352449225896} +{"ts": "2025-12-27T10:24:10", "event": "eval", "step": 3200, "epoch": 1.350210970464135, "eval_loss": 0.7200453281402588, "eval_runtime": 846.2953, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "perplexity": 2.0545263363912047} +{"ts": "2025-12-27T10:54:40", "event": "eval", "step": 3300, "epoch": 1.3924050632911391, "eval_loss": 0.7173135876655579, "eval_runtime": 853.5344, "eval_samples_per_second": 2.469, "eval_steps_per_second": 2.469, "perplexity": 2.0489215625209867} +{"ts": "2025-12-27T11:25:25", "event": "eval", "step": 3400, "epoch": 1.4345991561181435, "eval_loss": 0.715917706489563, "eval_runtime": 868.51, "eval_samples_per_second": 2.426, "eval_steps_per_second": 2.426, "perplexity": 2.046063506698008} +{"ts": "2025-12-27T11:55:47", "event": "eval", "step": 3500, "epoch": 1.4767932489451476, "eval_loss": 0.7155047059059143, "eval_runtime": 855.8428, "eval_samples_per_second": 2.462, "eval_steps_per_second": 2.462, "perplexity": 2.0452186557495358} +{"ts": "2025-12-27T12:26:22", "event": "eval", "step": 3600, "epoch": 1.518987341772152, "eval_loss": 0.7118256688117981, "eval_runtime": 851.3079, "eval_samples_per_second": 2.475, "eval_steps_per_second": 2.475, "perplexity": 2.0377080448290807} +{"ts": "2025-12-27T12:57:01", "event": "eval", "step": 3700, "epoch": 1.5611814345991561, "eval_loss": 0.7099412679672241, "eval_runtime": 857.2273, "eval_samples_per_second": 2.458, "eval_steps_per_second": 2.458, "perplexity": 2.0338718017134907} +{"ts": "2025-12-27T13:27:39", "event": "eval", "step": 3800, "epoch": 1.6033755274261603, "eval_loss": 0.7080941200256348, "eval_runtime": 865.6774, "eval_samples_per_second": 2.434, "eval_steps_per_second": 2.434, "perplexity": 2.030118407206169} +{"ts": "2025-12-27T13:58:20", "event": "eval", "step": 3900, "epoch": 1.6455696202531644, "eval_loss": 0.7049403786659241, "eval_runtime": 854.9866, "eval_samples_per_second": 2.464, "eval_steps_per_second": 2.464, "perplexity": 2.023726024080043} +{"ts": "2025-12-27T14:28:59", "event": "eval", "step": 4000, "epoch": 1.6877637130801688, "eval_loss": 0.7027890682220459, "eval_runtime": 848.7529, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "perplexity": 2.0193770408327394} +{"ts": "2025-12-27T14:59:26", "event": "eval", "step": 4100, "epoch": 1.729957805907173, "eval_loss": 0.7022181153297424, "eval_runtime": 844.6405, "eval_samples_per_second": 2.495, "eval_steps_per_second": 2.495, "perplexity": 2.0182244007535304} +{"ts": "2025-12-27T15:20:08", "event": "eval", "step": 4200, "epoch": 1.7721518987341773, "eval_loss": 0.6993561387062073, "eval_runtime": 542.0281, "eval_samples_per_second": 3.887, "eval_steps_per_second": 3.887, "perplexity": 2.012456547365305} +{"ts": "2025-12-27T15:39:13", "event": "eval", "step": 4300, "epoch": 1.8143459915611815, "eval_loss": 0.6981000900268555, "eval_runtime": 514.4659, "eval_samples_per_second": 4.096, "eval_steps_per_second": 4.096, "perplexity": 2.0099303907966624} +{"ts": "2025-12-27T15:58:13", "event": "eval", "step": 4400, "epoch": 1.8565400843881856, "eval_loss": 0.6961485147476196, "eval_runtime": 513.5724, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103, "perplexity": 2.0060116854010337} +{"ts": "2025-12-27T16:17:15", "event": "eval", "step": 4500, "epoch": 1.8987341772151898, "eval_loss": 0.6938078999519348, "eval_runtime": 513.615, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102, "perplexity": 2.0013218754302557} +{"ts": "2025-12-27T16:38:02", "event": "eval", "step": 4600, "epoch": 1.9409282700421941, "eval_loss": 0.6930755376815796, "eval_runtime": 617.8927, "eval_samples_per_second": 3.41, "eval_steps_per_second": 3.41, "perplexity": 1.999856719375848} +{"ts": "2025-12-27T16:58:16", "event": "eval", "step": 4700, "epoch": 1.9831223628691983, "eval_loss": 0.6923081278800964, "eval_runtime": 514.7729, "eval_samples_per_second": 4.093, "eval_steps_per_second": 4.093, "perplexity": 1.9983225984528428} +{"ts": "2025-12-27T17:17:24", "event": "eval", "step": 4800, "epoch": 2.0253164556962027, "eval_loss": 0.6924457550048828, "eval_runtime": 514.0427, "eval_samples_per_second": 4.099, "eval_steps_per_second": 4.099, "perplexity": 1.998597640772671} +{"ts": "2025-12-27T17:36:32", "event": "eval", "step": 4900, "epoch": 2.067510548523207, "eval_loss": 0.6941288113594055, "eval_runtime": 513.4497, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 2.0019642255133236} +{"ts": "2025-12-27T17:58:22", "event": "eval", "step": 5000, "epoch": 2.109704641350211, "eval_loss": 0.6908889412879944, "eval_runtime": 675.8398, "eval_samples_per_second": 3.118, "eval_steps_per_second": 3.118, "perplexity": 1.9954886172641344} +{"ts": "2025-12-27T18:23:03", "event": "eval", "step": 5100, "epoch": 2.151898734177215, "eval_loss": 0.6902023553848267, "eval_runtime": 733.915, "eval_samples_per_second": 2.871, "eval_steps_per_second": 2.871, "perplexity": 1.9941190131388347} +{"ts": "2025-12-27T18:55:59", "event": "eval", "step": 5200, "epoch": 2.1940928270042193, "eval_loss": 0.6915348172187805, "eval_runtime": 1167.9782, "eval_samples_per_second": 1.804, "eval_steps_per_second": 1.804, "perplexity": 1.9967778716365487} +{"ts": "2025-12-27T19:32:22", "event": "eval", "step": 5300, "epoch": 2.2362869198312234, "eval_loss": 0.6898328065872192, "eval_runtime": 739.3794, "eval_samples_per_second": 2.85, "eval_steps_per_second": 2.85, "perplexity": 1.993382225003213} +{"ts": "2025-12-27T19:58:17", "event": "eval", "step": 5400, "epoch": 2.278481012658228, "eval_loss": 0.6875645518302917, "eval_runtime": 861.3558, "eval_samples_per_second": 2.446, "eval_steps_per_second": 2.446, "perplexity": 1.988865850369486} +{"ts": "2025-12-27T20:31:44", "event": "eval", "step": 5500, "epoch": 2.320675105485232, "eval_loss": 0.6867148876190186, "eval_runtime": 941.3545, "eval_samples_per_second": 2.238, "eval_steps_per_second": 2.238, "perplexity": 1.9871766999423568} +{"ts": "2025-12-27T21:05:14", "event": "eval", "step": 5600, "epoch": 2.3628691983122363, "eval_loss": 0.6851074695587158, "eval_runtime": 938.5536, "eval_samples_per_second": 2.245, "eval_steps_per_second": 2.245, "perplexity": 1.9839850420773193} +{"ts": "2025-12-27T21:38:52", "event": "eval", "step": 5700, "epoch": 2.4050632911392404, "eval_loss": 0.6841402053833008, "eval_runtime": 941.6641, "eval_samples_per_second": 2.238, "eval_steps_per_second": 2.238, "perplexity": 1.9820669322305768} +{"ts": "2025-12-27T22:09:41", "event": "eval", "step": 5800, "epoch": 2.4472573839662446, "eval_loss": 0.6835155487060547, "eval_runtime": 758.407, "eval_samples_per_second": 2.778, "eval_steps_per_second": 2.778, "perplexity": 1.9808292075033642} +{"ts": "2025-12-27T22:28:42", "event": "eval", "step": 5900, "epoch": 2.489451476793249, "eval_loss": 0.6820966005325317, "eval_runtime": 513.3515, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 1.9780205066890182} +{"ts": "2025-12-27T22:47:43", "event": "eval", "step": 6000, "epoch": 2.5316455696202533, "eval_loss": 0.6813357472419739, "eval_runtime": 513.5491, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103, "perplexity": 1.9765160956683256} +{"ts": "2025-12-27T23:06:47", "event": "eval", "step": 6100, "epoch": 2.5738396624472575, "eval_loss": 0.6812278628349304, "eval_runtime": 513.4749, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103, "perplexity": 1.9763028719032991} +{"ts": "2025-12-27T23:25:56", "event": "eval", "step": 6200, "epoch": 2.6160337552742616, "eval_loss": 0.6795271039009094, "eval_runtime": 513.2393, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105, "perplexity": 1.972944513825857} +{"ts": "2025-12-27T23:44:50", "event": "eval", "step": 6300, "epoch": 2.6582278481012658, "eval_loss": 0.6781066656112671, "eval_runtime": 512.3669, "eval_samples_per_second": 4.112, "eval_steps_per_second": 4.112, "perplexity": 1.9701440573037758} +{"ts": "2025-12-28T00:03:48", "event": "eval", "step": 6400, "epoch": 2.70042194092827, "eval_loss": 0.6764505505561829, "eval_runtime": 512.7682, "eval_samples_per_second": 4.109, "eval_steps_per_second": 4.109, "perplexity": 1.9668839723527984} +{"ts": "2025-12-28T00:22:46", "event": "eval", "step": 6500, "epoch": 2.742616033755274, "eval_loss": 0.6768895387649536, "eval_runtime": 513.0657, "eval_samples_per_second": 4.107, "eval_steps_per_second": 4.107, "perplexity": 1.9677476007721588} +{"ts": "2025-12-28T00:41:51", "event": "eval", "step": 6600, "epoch": 2.7848101265822782, "eval_loss": 0.6737648844718933, "eval_runtime": 512.921, "eval_samples_per_second": 4.108, "eval_steps_per_second": 4.108, "perplexity": 1.9616086658032716} +{"ts": "2025-12-28T01:00:52", "event": "eval", "step": 6700, "epoch": 2.827004219409283, "eval_loss": 0.6737436056137085, "eval_runtime": 513.2559, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105, "perplexity": 1.961566925454753} +{"ts": "2025-12-28T01:19:55", "event": "eval", "step": 6800, "epoch": 2.869198312236287, "eval_loss": 0.6721681356430054, "eval_runtime": 513.1285, "eval_samples_per_second": 4.106, "eval_steps_per_second": 4.106, "perplexity": 1.9584789687983855} +{"ts": "2025-12-28T01:38:47", "event": "eval", "step": 6900, "epoch": 2.911392405063291, "eval_loss": 0.6713213920593262, "eval_runtime": 513.1265, "eval_samples_per_second": 4.106, "eval_steps_per_second": 4.106, "perplexity": 1.9568213411895954} +{"ts": "2025-12-28T01:57:47", "event": "eval", "step": 7000, "epoch": 2.9535864978902953, "eval_loss": 0.6706293225288391, "eval_runtime": 513.4396, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 1.955467553274469} +{"ts": "2025-12-28T02:16:49", "event": "eval", "step": 7100, "epoch": 2.9957805907173, "eval_loss": 0.6692973375320435, "eval_runtime": 512.8985, "eval_samples_per_second": 4.108, "eval_steps_per_second": 4.108, "perplexity": 1.9528646337415076} +{"ts": "2025-12-28T02:35:50", "event": "eval", "step": 7200, "epoch": 3.037974683544304, "eval_loss": 0.6751418709754944, "eval_runtime": 513.8972, "eval_samples_per_second": 4.1, "eval_steps_per_second": 4.1, "perplexity": 1.9643116350103986} +{"ts": "2025-12-28T02:54:46", "event": "eval", "step": 7300, "epoch": 3.080168776371308, "eval_loss": 0.678839385509491, "eval_runtime": 513.7013, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102, "perplexity": 1.9715881500500663} +{"ts": "2025-12-28T03:13:51", "event": "eval", "step": 7400, "epoch": 3.1223628691983123, "eval_loss": 0.676459550857544, "eval_runtime": 513.5901, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102, "perplexity": 1.9669016749809562} +{"ts": "2025-12-28T03:32:54", "event": "eval", "step": 7500, "epoch": 3.1645569620253164, "eval_loss": 0.6774632334709167, "eval_runtime": 513.4064, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104, "perplexity": 1.9688768110333967} +{"ts": "2025-12-28T03:51:52", "event": "eval", "step": 7600, "epoch": 3.2067510548523206, "eval_loss": 0.6755207777023315, "eval_runtime": 513.9779, "eval_samples_per_second": 4.099, "eval_steps_per_second": 4.099, "perplexity": 1.965056066928733} +{"ts": "2025-12-28T04:00:24", "event": "eval", "step": 7600, "epoch": 3.2067510548523206, "eval_loss": 0.6706293225288391, "eval_runtime": 511.6513, "eval_samples_per_second": 4.118, "eval_steps_per_second": 4.118, "perplexity": 1.955467553274469} diff --git a/sft_devstral_24B_v2/logs/train.jsonl b/sft_devstral_24B_v2/logs/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c66ca1dd68f334566905c60c19b70c633e1b7659 --- /dev/null +++ b/sft_devstral_24B_v2/logs/train.jsonl @@ -0,0 +1,3878 @@ +{"ts": "2025-12-26T18:09:06", "event": "train_log", "step": 2, "epoch": 0.0008438818565400844, "progress_pct": 0.01, "epoch_pct": 0.01, "eta": "26:25:01", "max_grad_norm": 0.8, "loss": 1.3927901983261108, "grad_norm": 1.597854733467102, "learning_rate": 8.787346221441124e-08} +{"ts": "2025-12-26T18:09:19", "event": "train_log", "step": 4, "epoch": 0.0016877637130801688, "progress_pct": 0.03, "epoch_pct": 0.03, "eta": "25:51:25", "max_grad_norm": 0.8, "loss": 1.407160758972168, "grad_norm": 1.6547431945800781, "learning_rate": 2.6362038664323375e-07} +{"ts": "2025-12-26T18:09:32", "event": "train_log", "step": 6, "epoch": 0.002531645569620253, "progress_pct": 0.04, "epoch_pct": 0.04, "eta": "25:42:10", "max_grad_norm": 0.8, "loss": 1.376656174659729, "grad_norm": 1.8221601247787476, "learning_rate": 4.393673110720563e-07} +{"ts": "2025-12-26T18:09:45", "event": "train_log", "step": 8, "epoch": 0.0033755274261603376, "progress_pct": 0.06, "epoch_pct": 0.06, "eta": "25:41:48", "max_grad_norm": 0.8, "loss": 1.247712254524231, "grad_norm": 1.4831048250198364, "learning_rate": 6.151142355008788e-07} +{"ts": "2025-12-26T18:09:57", "event": "train_log", "step": 10, "epoch": 0.004219409282700422, "progress_pct": 0.07, "epoch_pct": 0.07, "eta": "25:35:59", "max_grad_norm": 0.8, "loss": 1.2685163021087646, "grad_norm": 1.668201208114624, "learning_rate": 7.908611599297013e-07} +{"ts": "2025-12-26T18:10:10", "event": "train_log", "step": 12, "epoch": 0.005063291139240506, "progress_pct": 0.08, "epoch_pct": 0.08, "eta": "25:28:48", "max_grad_norm": 0.8, "loss": 1.2942761182785034, "grad_norm": 1.67417311668396, "learning_rate": 9.666080843585237e-07} +{"ts": "2025-12-26T18:10:22", "event": "train_log", "step": 14, "epoch": 0.00590717299578059, "progress_pct": 0.1, "epoch_pct": 0.1, "eta": "25:19:06", "max_grad_norm": 0.8, "loss": 1.3638604879379272, "grad_norm": 1.7154079675674438, "learning_rate": 1.1423550087873463e-06} +{"ts": "2025-12-26T18:10:35", "event": "train_log", "step": 16, "epoch": 0.006751054852320675, "progress_pct": 0.11, "epoch_pct": 0.11, "eta": "25:09:55", "max_grad_norm": 0.8, "loss": 1.3476728200912476, "grad_norm": 1.729427456855774, "learning_rate": 1.3181019332161688e-06} +{"ts": "2025-12-26T18:10:47", "event": "train_log", "step": 18, "epoch": 0.007594936708860759, "progress_pct": 0.13, "epoch_pct": 0.13, "eta": "25:03:46", "max_grad_norm": 0.8, "loss": 1.3476393222808838, "grad_norm": 1.3813447952270508, "learning_rate": 1.4938488576449913e-06} +{"ts": "2025-12-26T18:10:59", "event": "train_log", "step": 20, "epoch": 0.008438818565400843, "progress_pct": 0.14, "epoch_pct": 0.14, "eta": "25:02:55", "max_grad_norm": 0.8, "loss": 1.2449309825897217, "grad_norm": 1.557220458984375, "learning_rate": 1.6695957820738139e-06} +{"ts": "2025-12-26T18:11:13", "event": "train_log", "step": 22, "epoch": 0.009282700421940928, "progress_pct": 0.15, "epoch_pct": 0.15, "eta": "25:06:54", "max_grad_norm": 0.8, "loss": 1.3125361204147339, "grad_norm": 1.1883500814437866, "learning_rate": 1.8453427065026362e-06} +{"ts": "2025-12-26T18:11:25", "event": "train_log", "step": 24, "epoch": 0.010126582278481013, "progress_pct": 0.17, "epoch_pct": 0.17, "eta": "25:02:32", "max_grad_norm": 0.8, "loss": 1.3724769353866577, "grad_norm": 1.7290029525756836, "learning_rate": 2.0210896309314587e-06} +{"ts": "2025-12-26T18:11:37", "event": "train_log", "step": 26, "epoch": 0.010970464135021098, "progress_pct": 0.18, "epoch_pct": 0.18, "eta": "25:00:40", "max_grad_norm": 0.8, "loss": 1.3401387929916382, "grad_norm": 1.5627557039260864, "learning_rate": 2.1968365553602812e-06} +{"ts": "2025-12-26T18:11:51", "event": "train_log", "step": 28, "epoch": 0.01181434599156118, "progress_pct": 0.2, "epoch_pct": 0.2, "eta": "25:04:55", "max_grad_norm": 0.8, "loss": 1.365437388420105, "grad_norm": 1.796866774559021, "learning_rate": 2.3725834797891038e-06} +{"ts": "2025-12-26T18:12:03", "event": "train_log", "step": 30, "epoch": 0.012658227848101266, "progress_pct": 0.21, "epoch_pct": 0.21, "eta": "25:03:58", "max_grad_norm": 0.8, "loss": 1.2706533670425415, "grad_norm": 1.7030404806137085, "learning_rate": 2.5483304042179263e-06} +{"ts": "2025-12-26T18:12:16", "event": "train_log", "step": 32, "epoch": 0.01350210970464135, "progress_pct": 0.23, "epoch_pct": 0.23, "eta": "25:01:31", "max_grad_norm": 0.8, "loss": 1.3084994554519653, "grad_norm": 1.3186293840408325, "learning_rate": 2.724077328646749e-06} +{"ts": "2025-12-26T18:12:28", "event": "train_log", "step": 34, "epoch": 0.014345991561181435, "progress_pct": 0.24, "epoch_pct": 0.24, "eta": "24:59:23", "max_grad_norm": 0.8, "loss": 1.3259696960449219, "grad_norm": 1.5762513875961304, "learning_rate": 2.8998242530755714e-06} +{"ts": "2025-12-26T18:12:41", "event": "train_log", "step": 36, "epoch": 0.015189873417721518, "progress_pct": 0.25, "epoch_pct": 0.25, "eta": "24:59:32", "max_grad_norm": 0.8, "loss": 1.3205676078796387, "grad_norm": 1.422295331954956, "learning_rate": 3.075571177504394e-06} +{"ts": "2025-12-26T18:12:54", "event": "train_log", "step": 38, "epoch": 0.016033755274261603, "progress_pct": 0.27, "epoch_pct": 0.27, "eta": "25:01:02", "max_grad_norm": 0.8, "loss": 1.3740568161010742, "grad_norm": 1.495523452758789, "learning_rate": 3.2513181019332165e-06} +{"ts": "2025-12-26T18:13:07", "event": "train_log", "step": 40, "epoch": 0.016877637130801686, "progress_pct": 0.28, "epoch_pct": 0.28, "eta": "25:02:47", "max_grad_norm": 0.8, "loss": 1.321828842163086, "grad_norm": 1.5112254619598389, "learning_rate": 3.427065026362039e-06} +{"ts": "2025-12-26T18:13:19", "event": "train_log", "step": 42, "epoch": 0.017721518987341773, "progress_pct": 0.3, "epoch_pct": 0.3, "eta": "24:58:58", "max_grad_norm": 0.8, "loss": 1.3673173189163208, "grad_norm": 1.4667807817459106, "learning_rate": 3.602811950790861e-06} +{"ts": "2025-12-26T18:13:31", "event": "train_log", "step": 44, "epoch": 0.018565400843881856, "progress_pct": 0.31, "epoch_pct": 0.31, "eta": "24:58:11", "max_grad_norm": 0.8, "loss": 1.3968093395233154, "grad_norm": 1.6609723567962646, "learning_rate": 3.7785588752196836e-06} +{"ts": "2025-12-26T18:13:43", "event": "train_log", "step": 46, "epoch": 0.019409282700421943, "progress_pct": 0.32, "epoch_pct": 0.32, "eta": "24:53:58", "max_grad_norm": 0.8, "loss": 1.4295302629470825, "grad_norm": 1.59381103515625, "learning_rate": 3.954305799648506e-06} +{"ts": "2025-12-26T18:13:56", "event": "train_log", "step": 48, "epoch": 0.020253164556962026, "progress_pct": 0.34, "epoch_pct": 0.34, "eta": "24:52:47", "max_grad_norm": 0.8, "loss": 1.2536572217941284, "grad_norm": 1.1470608711242676, "learning_rate": 4.130052724077329e-06} +{"ts": "2025-12-26T18:14:09", "event": "train_log", "step": 50, "epoch": 0.02109704641350211, "progress_pct": 0.35, "epoch_pct": 0.35, "eta": "24:53:46", "max_grad_norm": 0.8, "loss": 1.242217779159546, "grad_norm": 1.2014588117599487, "learning_rate": 4.305799648506151e-06} +{"ts": "2025-12-26T18:14:21", "event": "train_log", "step": 52, "epoch": 0.021940928270042195, "progress_pct": 0.37, "epoch_pct": 0.37, "eta": "24:53:08", "max_grad_norm": 0.8, "loss": 1.2166963815689087, "grad_norm": 1.2327464818954468, "learning_rate": 4.481546572934974e-06} +{"ts": "2025-12-26T18:14:33", "event": "train_log", "step": 54, "epoch": 0.02278481012658228, "progress_pct": 0.38, "epoch_pct": 0.38, "eta": "24:49:33", "max_grad_norm": 0.8, "loss": 1.25709867477417, "grad_norm": 1.9708983898162842, "learning_rate": 4.657293497363796e-06} +{"ts": "2025-12-26T18:14:46", "event": "train_log", "step": 56, "epoch": 0.02362869198312236, "progress_pct": 0.39, "epoch_pct": 0.39, "eta": "24:50:52", "max_grad_norm": 0.8, "loss": 1.2886158227920532, "grad_norm": 1.180569052696228, "learning_rate": 4.833040421792619e-06} +{"ts": "2025-12-26T18:14:58", "event": "train_log", "step": 58, "epoch": 0.024472573839662448, "progress_pct": 0.41, "epoch_pct": 0.41, "eta": "24:49:06", "max_grad_norm": 0.8, "loss": 1.29886794090271, "grad_norm": 1.5029548406600952, "learning_rate": 5.008787346221441e-06} +{"ts": "2025-12-26T18:15:11", "event": "train_log", "step": 60, "epoch": 0.02531645569620253, "progress_pct": 0.42, "epoch_pct": 0.42, "eta": "24:47:29", "max_grad_norm": 0.8, "loss": 1.2387628555297852, "grad_norm": 1.5380216836929321, "learning_rate": 5.184534270650264e-06} +{"ts": "2025-12-26T18:15:24", "event": "train_log", "step": 62, "epoch": 0.026160337552742614, "progress_pct": 0.44, "epoch_pct": 0.44, "eta": "24:48:42", "max_grad_norm": 0.8, "loss": 1.2177000045776367, "grad_norm": 1.572144865989685, "learning_rate": 5.3602811950790864e-06} +{"ts": "2025-12-26T18:15:36", "event": "train_log", "step": 64, "epoch": 0.0270042194092827, "progress_pct": 0.45, "epoch_pct": 0.45, "eta": "24:48:06", "max_grad_norm": 0.8, "loss": 1.181516170501709, "grad_norm": 1.4882780313491821, "learning_rate": 5.536028119507909e-06} +{"ts": "2025-12-26T18:15:49", "event": "train_log", "step": 66, "epoch": 0.027848101265822784, "progress_pct": 0.46, "epoch_pct": 0.46, "eta": "24:48:23", "max_grad_norm": 0.8, "loss": 1.2101733684539795, "grad_norm": 1.2982488870620728, "learning_rate": 5.7117750439367315e-06} +{"ts": "2025-12-26T18:16:02", "event": "train_log", "step": 68, "epoch": 0.02869198312236287, "progress_pct": 0.48, "epoch_pct": 0.48, "eta": "24:49:55", "max_grad_norm": 0.8, "loss": 1.2277681827545166, "grad_norm": 1.5236955881118774, "learning_rate": 5.887521968365554e-06} +{"ts": "2025-12-26T18:16:15", "event": "train_log", "step": 70, "epoch": 0.029535864978902954, "progress_pct": 0.49, "epoch_pct": 0.49, "eta": "24:52:02", "max_grad_norm": 0.8, "loss": 1.1688424348831177, "grad_norm": 1.4521006345748901, "learning_rate": 6.0632688927943766e-06} +{"ts": "2025-12-26T18:16:27", "event": "train_log", "step": 72, "epoch": 0.030379746835443037, "progress_pct": 0.51, "epoch_pct": 0.51, "eta": "24:48:44", "max_grad_norm": 0.8, "loss": 1.273059368133545, "grad_norm": 1.2352311611175537, "learning_rate": 6.239015817223199e-06} +{"ts": "2025-12-26T18:16:40", "event": "train_log", "step": 74, "epoch": 0.031223628691983123, "progress_pct": 0.52, "epoch_pct": 0.52, "eta": "24:50:28", "max_grad_norm": 0.8, "loss": 1.1609034538269043, "grad_norm": 1.3438209295272827, "learning_rate": 6.414762741652021e-06} +{"ts": "2025-12-26T18:16:53", "event": "train_log", "step": 76, "epoch": 0.032067510548523206, "progress_pct": 0.53, "epoch_pct": 0.53, "eta": "24:50:50", "max_grad_norm": 0.8, "loss": 1.2508260011672974, "grad_norm": 1.9009398221969604, "learning_rate": 6.590509666080843e-06} +{"ts": "2025-12-26T18:17:05", "event": "train_log", "step": 78, "epoch": 0.03291139240506329, "progress_pct": 0.55, "epoch_pct": 0.55, "eta": "24:49:14", "max_grad_norm": 0.8, "loss": 1.2524956464767456, "grad_norm": 1.6718412637710571, "learning_rate": 6.766256590509666e-06} +{"ts": "2025-12-26T18:17:21", "event": "train_log", "step": 80, "epoch": 0.03375527426160337, "progress_pct": 0.56, "epoch_pct": 0.56, "eta": "24:56:54", "max_grad_norm": 0.8, "loss": 1.1472493410110474, "grad_norm": 1.249891757965088, "learning_rate": 6.942003514938488e-06} +{"ts": "2025-12-26T18:17:41", "event": "train_log", "step": 82, "epoch": 0.03459915611814346, "progress_pct": 0.58, "epoch_pct": 0.58, "eta": "25:19:22", "max_grad_norm": 0.8, "loss": 1.0845389366149902, "grad_norm": 1.4398653507232666, "learning_rate": 7.117750439367312e-06} +{"ts": "2025-12-26T18:18:01", "event": "train_log", "step": 84, "epoch": 0.035443037974683546, "progress_pct": 0.59, "epoch_pct": 0.59, "eta": "25:38:34", "max_grad_norm": 0.8, "loss": 1.1088868379592896, "grad_norm": 1.3701167106628418, "learning_rate": 7.293497363796134e-06} +{"ts": "2025-12-26T18:18:22", "event": "train_log", "step": 86, "epoch": 0.036286919831223625, "progress_pct": 0.6, "epoch_pct": 0.6, "eta": "25:58:42", "max_grad_norm": 0.8, "loss": 1.1513772010803223, "grad_norm": 1.277998924255371, "learning_rate": 7.469244288224957e-06} +{"ts": "2025-12-26T18:18:42", "event": "train_log", "step": 88, "epoch": 0.03713080168776371, "progress_pct": 0.62, "epoch_pct": 0.62, "eta": "26:18:01", "max_grad_norm": 0.8, "loss": 1.1385771036148071, "grad_norm": 1.4970002174377441, "learning_rate": 7.644991212653779e-06} +{"ts": "2025-12-26T18:19:02", "event": "train_log", "step": 90, "epoch": 0.0379746835443038, "progress_pct": 0.63, "epoch_pct": 0.63, "eta": "26:35:20", "max_grad_norm": 0.8, "loss": 1.1632680892944336, "grad_norm": 1.3384218215942383, "learning_rate": 7.820738137082601e-06} +{"ts": "2025-12-26T18:19:23", "event": "train_log", "step": 92, "epoch": 0.038818565400843885, "progress_pct": 0.65, "epoch_pct": 0.65, "eta": "26:54:53", "max_grad_norm": 0.8, "loss": 1.2256064414978027, "grad_norm": 1.4317446947097778, "learning_rate": 7.996485061511425e-06} +{"ts": "2025-12-26T18:19:43", "event": "train_log", "step": 94, "epoch": 0.039662447257383965, "progress_pct": 0.66, "epoch_pct": 0.66, "eta": "27:08:43", "max_grad_norm": 0.8, "loss": 1.1935789585113525, "grad_norm": 1.8743640184402466, "learning_rate": 8.172231985940246e-06} +{"ts": "2025-12-26T18:20:02", "event": "train_log", "step": 96, "epoch": 0.04050632911392405, "progress_pct": 0.68, "epoch_pct": 0.68, "eta": "27:22:05", "max_grad_norm": 0.8, "loss": 1.1429362297058105, "grad_norm": 1.4789546728134155, "learning_rate": 8.347978910369069e-06} +{"ts": "2025-12-26T18:20:19", "event": "train_log", "step": 98, "epoch": 0.04135021097046414, "progress_pct": 0.69, "epoch_pct": 0.69, "eta": "27:27:51", "max_grad_norm": 0.8, "loss": 1.1831508874893188, "grad_norm": 1.658605694770813, "learning_rate": 8.523725834797891e-06} +{"ts": "2025-12-26T18:20:39", "event": "train_log", "step": 100, "epoch": 0.04219409282700422, "progress_pct": 0.7, "epoch_pct": 0.7, "eta": "27:42:56", "max_grad_norm": 0.8, "loss": 1.0539867877960205, "grad_norm": 1.5077892541885376, "learning_rate": 8.699472759226714e-06} +{"ts": "2025-12-26T18:34:59", "event": "train_log", "step": 100, "epoch": 0.04219409282700422, "progress_pct": 0.7, "epoch_pct": 0.7, "eta": "61:26:08", "max_grad_norm": 0.8, "eval_loss": 1.138856053352356, "eval_runtime": 859.7128, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451} +{"ts": "2025-12-26T18:35:18", "event": "train_log", "step": 102, "epoch": 0.043037974683544304, "progress_pct": 0.72, "epoch_pct": 0.72, "eta": "60:58:09", "max_grad_norm": 0.8, "loss": 1.0719901323318481, "grad_norm": 1.4335681200027466, "learning_rate": 8.875219683655536e-06} +{"ts": "2025-12-26T18:35:37", "event": "train_log", "step": 104, "epoch": 0.04388185654008439, "progress_pct": 0.73, "epoch_pct": 0.73, "eta": "60:30:36", "max_grad_norm": 0.8, "loss": 1.0654313564300537, "grad_norm": 1.7387681007385254, "learning_rate": 9.050966608084359e-06} +{"ts": "2025-12-26T18:35:56", "event": "train_log", "step": 106, "epoch": 0.04472573839662447, "progress_pct": 0.75, "epoch_pct": 0.75, "eta": "60:02:23", "max_grad_norm": 0.8, "loss": 1.0752698183059692, "grad_norm": 1.6071950197219849, "learning_rate": 9.226713532513181e-06} +{"ts": "2025-12-26T18:36:14", "event": "train_log", "step": 108, "epoch": 0.04556962025316456, "progress_pct": 0.76, "epoch_pct": 0.76, "eta": "59:35:36", "max_grad_norm": 0.8, "loss": 1.1029763221740723, "grad_norm": 1.40005362033844, "learning_rate": 9.402460456942004e-06} +{"ts": "2025-12-26T18:36:34", "event": "train_log", "step": 110, "epoch": 0.046413502109704644, "progress_pct": 0.77, "epoch_pct": 0.77, "eta": "59:11:31", "max_grad_norm": 0.8, "loss": 1.1157960891723633, "grad_norm": 2.2338669300079346, "learning_rate": 9.578207381370826e-06} +{"ts": "2025-12-26T18:36:52", "event": "train_log", "step": 112, "epoch": 0.04725738396624472, "progress_pct": 0.79, "epoch_pct": 0.79, "eta": "58:46:50", "max_grad_norm": 0.8, "loss": 1.1095420122146606, "grad_norm": 1.4972727298736572, "learning_rate": 9.753954305799649e-06} +{"ts": "2025-12-26T18:37:12", "event": "train_log", "step": 114, "epoch": 0.04810126582278481, "progress_pct": 0.8, "epoch_pct": 0.8, "eta": "58:25:33", "max_grad_norm": 0.8, "loss": 1.109113097190857, "grad_norm": 1.317979097366333, "learning_rate": 9.929701230228471e-06} +{"ts": "2025-12-26T18:37:31", "event": "train_log", "step": 116, "epoch": 0.048945147679324896, "progress_pct": 0.82, "epoch_pct": 0.82, "eta": "58:02:32", "max_grad_norm": 0.8, "loss": 1.1055104732513428, "grad_norm": 1.496346116065979, "learning_rate": 1.0105448154657294e-05} +{"ts": "2025-12-26T18:37:50", "event": "train_log", "step": 118, "epoch": 0.049789029535864976, "progress_pct": 0.83, "epoch_pct": 0.83, "eta": "57:40:46", "max_grad_norm": 0.8, "loss": 1.118395209312439, "grad_norm": 1.385406732559204, "learning_rate": 1.0281195079086117e-05} +{"ts": "2025-12-26T18:38:08", "event": "train_log", "step": 120, "epoch": 0.05063291139240506, "progress_pct": 0.84, "epoch_pct": 0.84, "eta": "57:18:31", "max_grad_norm": 0.8, "loss": 1.1008446216583252, "grad_norm": 1.524222731590271, "learning_rate": 1.0456942003514939e-05} +{"ts": "2025-12-26T18:38:27", "event": "train_log", "step": 122, "epoch": 0.05147679324894515, "progress_pct": 0.86, "epoch_pct": 0.86, "eta": "56:57:29", "max_grad_norm": 0.8, "loss": 1.0891425609588623, "grad_norm": 1.6308200359344482, "learning_rate": 1.0632688927943762e-05} +{"ts": "2025-12-26T18:38:46", "event": "train_log", "step": 124, "epoch": 0.05232067510548523, "progress_pct": 0.87, "epoch_pct": 0.87, "eta": "56:37:42", "max_grad_norm": 0.8, "loss": 0.9080473184585571, "grad_norm": 1.3681106567382812, "learning_rate": 1.0808435852372584e-05} +{"ts": "2025-12-26T18:39:06", "event": "train_log", "step": 126, "epoch": 0.053164556962025315, "progress_pct": 0.89, "epoch_pct": 0.89, "eta": "56:20:19", "max_grad_norm": 0.8, "loss": 1.0337369441986084, "grad_norm": 1.9429908990859985, "learning_rate": 1.0984182776801407e-05} +{"ts": "2025-12-26T18:39:24", "event": "train_log", "step": 128, "epoch": 0.0540084388185654, "progress_pct": 0.9, "epoch_pct": 0.9, "eta": "56:01:10", "max_grad_norm": 0.8, "loss": 1.0703333616256714, "grad_norm": 1.5830830335617065, "learning_rate": 1.115992970123023e-05} +{"ts": "2025-12-26T18:39:43", "event": "train_log", "step": 130, "epoch": 0.05485232067510549, "progress_pct": 0.91, "epoch_pct": 0.91, "eta": "55:42:22", "max_grad_norm": 0.8, "loss": 1.004652738571167, "grad_norm": 1.4792555570602417, "learning_rate": 1.1335676625659052e-05} +{"ts": "2025-12-26T18:40:03", "event": "train_log", "step": 132, "epoch": 0.05569620253164557, "progress_pct": 0.93, "epoch_pct": 0.93, "eta": "55:27:42", "max_grad_norm": 0.8, "loss": 0.9798293709754944, "grad_norm": 1.7196226119995117, "learning_rate": 1.1511423550087874e-05} +{"ts": "2025-12-26T18:40:22", "event": "train_log", "step": 134, "epoch": 0.056540084388185655, "progress_pct": 0.94, "epoch_pct": 0.94, "eta": "55:10:00", "max_grad_norm": 0.8, "loss": 1.0213249921798706, "grad_norm": 1.8733659982681274, "learning_rate": 1.1687170474516697e-05} +{"ts": "2025-12-26T18:40:41", "event": "train_log", "step": 136, "epoch": 0.05738396624472574, "progress_pct": 0.96, "epoch_pct": 0.96, "eta": "54:54:50", "max_grad_norm": 0.8, "loss": 1.0358591079711914, "grad_norm": 1.3431142568588257, "learning_rate": 1.186291739894552e-05} +{"ts": "2025-12-26T18:41:01", "event": "train_log", "step": 138, "epoch": 0.05822784810126582, "progress_pct": 0.97, "epoch_pct": 0.97, "eta": "54:39:21", "max_grad_norm": 0.8, "loss": 0.9372249841690063, "grad_norm": 1.527864933013916, "learning_rate": 1.2038664323374342e-05} +{"ts": "2025-12-26T18:41:20", "event": "train_log", "step": 140, "epoch": 0.05907172995780591, "progress_pct": 0.98, "epoch_pct": 0.98, "eta": "54:23:55", "max_grad_norm": 0.8, "loss": 1.0277758836746216, "grad_norm": 1.5495563745498657, "learning_rate": 1.2214411247803164e-05} +{"ts": "2025-12-26T18:41:40", "event": "train_log", "step": 142, "epoch": 0.059915611814345994, "progress_pct": 1.0, "epoch_pct": 1.0, "eta": "54:10:30", "max_grad_norm": 0.8, "loss": 1.0349801778793335, "grad_norm": 1.6792418956756592, "learning_rate": 1.2390158172231985e-05} +{"ts": "2025-12-26T18:41:58", "event": "train_log", "step": 144, "epoch": 0.060759493670886074, "progress_pct": 1.01, "epoch_pct": 1.01, "eta": "53:55:22", "max_grad_norm": 0.8, "loss": 0.9578297734260559, "grad_norm": 1.6468945741653442, "learning_rate": 1.256590509666081e-05} +{"ts": "2025-12-26T18:42:18", "event": "train_log", "step": 146, "epoch": 0.06160337552742616, "progress_pct": 1.03, "epoch_pct": 1.03, "eta": "53:41:24", "max_grad_norm": 0.8, "loss": 1.0628854036331177, "grad_norm": 1.7243824005126953, "learning_rate": 1.2741652021089632e-05} +{"ts": "2025-12-26T18:42:38", "event": "train_log", "step": 148, "epoch": 0.06244725738396625, "progress_pct": 1.04, "epoch_pct": 1.04, "eta": "53:29:14", "max_grad_norm": 0.8, "loss": 0.9336449503898621, "grad_norm": 1.7286981344223022, "learning_rate": 1.2917398945518455e-05} +{"ts": "2025-12-26T18:42:59", "event": "train_log", "step": 150, "epoch": 0.06329113924050633, "progress_pct": 1.05, "epoch_pct": 1.05, "eta": "53:18:44", "max_grad_norm": 0.8, "loss": 0.953730583190918, "grad_norm": 1.6411832571029663, "learning_rate": 1.3093145869947277e-05} +{"ts": "2025-12-26T18:43:18", "event": "train_log", "step": 152, "epoch": 0.06413502109704641, "progress_pct": 1.07, "epoch_pct": 1.07, "eta": "53:05:30", "max_grad_norm": 0.8, "loss": 1.051239013671875, "grad_norm": 1.8297001123428345, "learning_rate": 1.3268892794376098e-05} +{"ts": "2025-12-26T18:43:36", "event": "train_log", "step": 154, "epoch": 0.06497890295358649, "progress_pct": 1.08, "epoch_pct": 1.08, "eta": "52:52:09", "max_grad_norm": 0.8, "loss": 0.9955035448074341, "grad_norm": 1.9660519361495972, "learning_rate": 1.3444639718804922e-05} +{"ts": "2025-12-26T18:43:56", "event": "train_log", "step": 156, "epoch": 0.06582278481012659, "progress_pct": 1.1, "epoch_pct": 1.1, "eta": "52:40:07", "max_grad_norm": 0.8, "loss": 0.913300096988678, "grad_norm": 1.8423733711242676, "learning_rate": 1.3620386643233743e-05} +{"ts": "2025-12-26T18:44:16", "event": "train_log", "step": 158, "epoch": 0.06666666666666667, "progress_pct": 1.11, "epoch_pct": 1.11, "eta": "52:29:51", "max_grad_norm": 0.8, "loss": 1.0429846048355103, "grad_norm": 1.9146347045898438, "learning_rate": 1.3796133567662567e-05} +{"ts": "2025-12-26T18:44:37", "event": "train_log", "step": 160, "epoch": 0.06751054852320675, "progress_pct": 1.13, "epoch_pct": 1.13, "eta": "52:20:25", "max_grad_norm": 0.8, "loss": 1.0360238552093506, "grad_norm": 1.6221821308135986, "learning_rate": 1.3971880492091388e-05} +{"ts": "2025-12-26T18:44:55", "event": "train_log", "step": 162, "epoch": 0.06835443037974684, "progress_pct": 1.14, "epoch_pct": 1.14, "eta": "52:07:22", "max_grad_norm": 0.8, "loss": 1.0227266550064087, "grad_norm": 2.173283338546753, "learning_rate": 1.4147627416520212e-05} +{"ts": "2025-12-26T18:45:15", "event": "train_log", "step": 164, "epoch": 0.06919831223628692, "progress_pct": 1.15, "epoch_pct": 1.15, "eta": "51:57:06", "max_grad_norm": 0.8, "loss": 1.0075194835662842, "grad_norm": 1.7091665267944336, "learning_rate": 1.4323374340949033e-05} +{"ts": "2025-12-26T18:45:35", "event": "train_log", "step": 166, "epoch": 0.070042194092827, "progress_pct": 1.17, "epoch_pct": 1.17, "eta": "51:47:15", "max_grad_norm": 0.8, "loss": 1.0044782161712646, "grad_norm": 1.7219135761260986, "learning_rate": 1.4499121265377857e-05} +{"ts": "2025-12-26T18:45:56", "event": "train_log", "step": 168, "epoch": 0.07088607594936709, "progress_pct": 1.18, "epoch_pct": 1.18, "eta": "51:39:44", "max_grad_norm": 0.8, "loss": 0.9393973350524902, "grad_norm": 1.6558159589767456, "learning_rate": 1.4674868189806678e-05} +{"ts": "2025-12-26T18:46:15", "event": "train_log", "step": 170, "epoch": 0.07172995780590717, "progress_pct": 1.2, "epoch_pct": 1.2, "eta": "51:28:28", "max_grad_norm": 0.8, "loss": 0.9955337643623352, "grad_norm": 1.9362739324569702, "learning_rate": 1.4850615114235502e-05} +{"ts": "2025-12-26T18:46:35", "event": "train_log", "step": 172, "epoch": 0.07257383966244725, "progress_pct": 1.21, "epoch_pct": 1.21, "eta": "51:19:47", "max_grad_norm": 0.8, "loss": 0.9659126400947571, "grad_norm": 1.7792853116989136, "learning_rate": 1.5026362038664323e-05} +{"ts": "2025-12-26T18:46:56", "event": "train_log", "step": 174, "epoch": 0.07341772151898734, "progress_pct": 1.22, "epoch_pct": 1.22, "eta": "51:12:36", "max_grad_norm": 0.8, "loss": 0.9077855348587036, "grad_norm": 1.7184511423110962, "learning_rate": 1.5202108963093147e-05} +{"ts": "2025-12-26T18:47:17", "event": "train_log", "step": 176, "epoch": 0.07426160337552742, "progress_pct": 1.24, "epoch_pct": 1.24, "eta": "51:04:56", "max_grad_norm": 0.8, "loss": 0.9305018782615662, "grad_norm": 1.5701428651809692, "learning_rate": 1.537785588752197e-05} +{"ts": "2025-12-26T18:47:37", "event": "train_log", "step": 178, "epoch": 0.0751054852320675, "progress_pct": 1.25, "epoch_pct": 1.25, "eta": "50:56:42", "max_grad_norm": 0.8, "loss": 1.0211774110794067, "grad_norm": 1.970229148864746, "learning_rate": 1.555360281195079e-05} +{"ts": "2025-12-26T18:47:58", "event": "train_log", "step": 180, "epoch": 0.0759493670886076, "progress_pct": 1.27, "epoch_pct": 1.27, "eta": "50:49:34", "max_grad_norm": 0.8, "loss": 0.9479315876960754, "grad_norm": 1.8410269021987915, "learning_rate": 1.5729349736379615e-05} +{"ts": "2025-12-26T18:48:18", "event": "train_log", "step": 182, "epoch": 0.07679324894514768, "progress_pct": 1.28, "epoch_pct": 1.28, "eta": "50:40:40", "max_grad_norm": 0.8, "loss": 1.0629050731658936, "grad_norm": 1.8991246223449707, "learning_rate": 1.5905096660808434e-05} +{"ts": "2025-12-26T18:48:37", "event": "train_log", "step": 184, "epoch": 0.07763713080168777, "progress_pct": 1.29, "epoch_pct": 1.29, "eta": "50:31:55", "max_grad_norm": 0.8, "loss": 0.946983814239502, "grad_norm": 1.8052008152008057, "learning_rate": 1.608084358523726e-05} +{"ts": "2025-12-26T18:48:58", "event": "train_log", "step": 186, "epoch": 0.07848101265822785, "progress_pct": 1.31, "epoch_pct": 1.31, "eta": "50:25:25", "max_grad_norm": 0.8, "loss": 0.9413356184959412, "grad_norm": 1.547108769416809, "learning_rate": 1.625659050966608e-05} +{"ts": "2025-12-26T18:49:18", "event": "train_log", "step": 188, "epoch": 0.07932489451476793, "progress_pct": 1.32, "epoch_pct": 1.32, "eta": "50:17:38", "max_grad_norm": 0.8, "loss": 0.9337888956069946, "grad_norm": 1.9713538885116577, "learning_rate": 1.6432337434094905e-05} +{"ts": "2025-12-26T18:49:38", "event": "train_log", "step": 190, "epoch": 0.08016877637130802, "progress_pct": 1.34, "epoch_pct": 1.34, "eta": "50:09:18", "max_grad_norm": 0.8, "loss": 0.9816337823867798, "grad_norm": 1.708789348602295, "learning_rate": 1.6608084358523728e-05} +{"ts": "2025-12-26T18:49:57", "event": "train_log", "step": 192, "epoch": 0.0810126582278481, "progress_pct": 1.35, "epoch_pct": 1.35, "eta": "50:01:15", "max_grad_norm": 0.8, "loss": 1.017122507095337, "grad_norm": 1.815292477607727, "learning_rate": 1.678383128295255e-05} +{"ts": "2025-12-26T18:50:16", "event": "train_log", "step": 194, "epoch": 0.08185654008438818, "progress_pct": 1.36, "epoch_pct": 1.36, "eta": "49:53:07", "max_grad_norm": 0.8, "loss": 0.991599440574646, "grad_norm": 1.7950682640075684, "learning_rate": 1.6959578207381373e-05} +{"ts": "2025-12-26T18:50:36", "event": "train_log", "step": 196, "epoch": 0.08270042194092828, "progress_pct": 1.38, "epoch_pct": 1.38, "eta": "49:45:30", "max_grad_norm": 0.8, "loss": 0.9570834040641785, "grad_norm": 1.692512035369873, "learning_rate": 1.7135325131810195e-05} +{"ts": "2025-12-26T18:50:55", "event": "train_log", "step": 198, "epoch": 0.08354430379746836, "progress_pct": 1.39, "epoch_pct": 1.39, "eta": "49:37:29", "max_grad_norm": 0.8, "loss": 1.035754919052124, "grad_norm": 2.056089162826538, "learning_rate": 1.7311072056239018e-05} +{"ts": "2025-12-26T18:51:15", "event": "train_log", "step": 200, "epoch": 0.08438818565400844, "progress_pct": 1.41, "epoch_pct": 1.41, "eta": "49:30:12", "max_grad_norm": 0.8, "loss": 1.0124205350875854, "grad_norm": 1.7022203207015991, "learning_rate": 1.7486818980667837e-05} +{"ts": "2025-12-26T19:05:22", "event": "train_log", "step": 200, "epoch": 0.08438818565400844, "progress_pct": 1.41, "epoch_pct": 1.41, "eta": "65:59:35", "max_grad_norm": 0.8, "eval_loss": 0.995743453502655, "eval_runtime": 846.8257, "eval_samples_per_second": 2.488, "eval_steps_per_second": 2.488} +{"ts": "2025-12-26T19:05:42", "event": "train_log", "step": 202, "epoch": 0.08523206751054853, "progress_pct": 1.42, "epoch_pct": 1.42, "eta": "65:44:02", "max_grad_norm": 0.8, "loss": 0.8946985006332397, "grad_norm": 1.6088604927062988, "learning_rate": 1.7662565905096663e-05} +{"ts": "2025-12-26T19:06:02", "event": "train_log", "step": 204, "epoch": 0.08607594936708861, "progress_pct": 1.43, "epoch_pct": 1.43, "eta": "65:26:42", "max_grad_norm": 0.8, "loss": 0.976133406162262, "grad_norm": 2.02270770072937, "learning_rate": 1.7838312829525482e-05} +{"ts": "2025-12-26T19:06:22", "event": "train_log", "step": 206, "epoch": 0.08691983122362869, "progress_pct": 1.45, "epoch_pct": 1.45, "eta": "65:11:12", "max_grad_norm": 0.8, "loss": 0.9079383611679077, "grad_norm": 1.7832789421081543, "learning_rate": 1.8014059753954308e-05} +{"ts": "2025-12-26T19:06:42", "event": "train_log", "step": 208, "epoch": 0.08776371308016878, "progress_pct": 1.46, "epoch_pct": 1.46, "eta": "64:55:09", "max_grad_norm": 0.8, "loss": 0.8650367856025696, "grad_norm": 1.9793545007705688, "learning_rate": 1.8189806678383127e-05} +{"ts": "2025-12-26T19:07:00", "event": "train_log", "step": 210, "epoch": 0.08860759493670886, "progress_pct": 1.48, "epoch_pct": 1.48, "eta": "64:38:12", "max_grad_norm": 0.8, "loss": 0.9327266812324524, "grad_norm": 1.8124271631240845, "learning_rate": 1.8365553602811953e-05} +{"ts": "2025-12-26T19:07:20", "event": "train_log", "step": 212, "epoch": 0.08945147679324894, "progress_pct": 1.49, "epoch_pct": 1.49, "eta": "64:22:45", "max_grad_norm": 0.8, "loss": 0.9811079502105713, "grad_norm": 1.8581212759017944, "learning_rate": 1.8541300527240772e-05} +{"ts": "2025-12-26T19:07:39", "event": "train_log", "step": 214, "epoch": 0.09029535864978903, "progress_pct": 1.5, "epoch_pct": 1.5, "eta": "64:07:06", "max_grad_norm": 0.8, "loss": 0.9546971321105957, "grad_norm": 2.001699447631836, "learning_rate": 1.8717047451669598e-05} +{"ts": "2025-12-26T19:07:59", "event": "train_log", "step": 216, "epoch": 0.09113924050632911, "progress_pct": 1.52, "epoch_pct": 1.52, "eta": "63:52:04", "max_grad_norm": 0.8, "loss": 0.9611319899559021, "grad_norm": 1.6994978189468384, "learning_rate": 1.8892794376098417e-05} +{"ts": "2025-12-26T19:08:18", "event": "train_log", "step": 218, "epoch": 0.0919831223628692, "progress_pct": 1.53, "epoch_pct": 1.53, "eta": "63:37:19", "max_grad_norm": 0.8, "loss": 0.9781531095504761, "grad_norm": 2.1379497051239014, "learning_rate": 1.9068541300527243e-05} +{"ts": "2025-12-26T19:08:38", "event": "train_log", "step": 220, "epoch": 0.09282700421940929, "progress_pct": 1.55, "epoch_pct": 1.55, "eta": "63:23:18", "max_grad_norm": 0.8, "loss": 0.9374833106994629, "grad_norm": 1.8961224555969238, "learning_rate": 1.9244288224956066e-05} +{"ts": "2025-12-26T19:08:59", "event": "train_log", "step": 222, "epoch": 0.09367088607594937, "progress_pct": 1.56, "epoch_pct": 1.56, "eta": "63:09:47", "max_grad_norm": 0.8, "loss": 0.9681299328804016, "grad_norm": 1.851464033126831, "learning_rate": 1.9420035149384885e-05} +{"ts": "2025-12-26T19:09:18", "event": "train_log", "step": 224, "epoch": 0.09451476793248945, "progress_pct": 1.58, "epoch_pct": 1.58, "eta": "62:55:38", "max_grad_norm": 0.8, "loss": 1.0086225271224976, "grad_norm": 2.0642266273498535, "learning_rate": 1.959578207381371e-05} +{"ts": "2025-12-26T19:09:38", "event": "train_log", "step": 226, "epoch": 0.09535864978902954, "progress_pct": 1.59, "epoch_pct": 1.59, "eta": "62:42:25", "max_grad_norm": 0.8, "loss": 0.9190312623977661, "grad_norm": 1.8658756017684937, "learning_rate": 1.977152899824253e-05} +{"ts": "2025-12-26T19:09:56", "event": "train_log", "step": 228, "epoch": 0.09620253164556962, "progress_pct": 1.6, "epoch_pct": 1.6, "eta": "62:26:39", "max_grad_norm": 0.8, "loss": 0.9740874171257019, "grad_norm": 2.4398674964904785, "learning_rate": 1.9947275922671356e-05} +{"ts": "2025-12-26T19:10:16", "event": "train_log", "step": 230, "epoch": 0.0970464135021097, "progress_pct": 1.62, "epoch_pct": 1.62, "eta": "62:14:21", "max_grad_norm": 0.8, "loss": 0.884376049041748, "grad_norm": 1.849183440208435, "learning_rate": 2.0123022847100175e-05} +{"ts": "2025-12-26T19:10:36", "event": "train_log", "step": 232, "epoch": 0.09789029535864979, "progress_pct": 1.63, "epoch_pct": 1.63, "eta": "62:01:49", "max_grad_norm": 0.8, "loss": 0.9116487503051758, "grad_norm": 2.027320384979248, "learning_rate": 2.0298769771529e-05} +{"ts": "2025-12-26T19:10:57", "event": "train_log", "step": 234, "epoch": 0.09873417721518987, "progress_pct": 1.65, "epoch_pct": 1.65, "eta": "61:50:27", "max_grad_norm": 0.8, "loss": 0.9035115242004395, "grad_norm": 1.6800135374069214, "learning_rate": 2.047451669595782e-05} +{"ts": "2025-12-26T19:11:18", "event": "train_log", "step": 236, "epoch": 0.09957805907172995, "progress_pct": 1.66, "epoch_pct": 1.66, "eta": "61:39:14", "max_grad_norm": 0.8, "loss": 0.9043796062469482, "grad_norm": 2.2362256050109863, "learning_rate": 2.0650263620386646e-05} +{"ts": "2025-12-26T19:11:37", "event": "train_log", "step": 238, "epoch": 0.10042194092827005, "progress_pct": 1.67, "epoch_pct": 1.67, "eta": "61:26:27", "max_grad_norm": 0.8, "loss": 1.0888828039169312, "grad_norm": 1.938215970993042, "learning_rate": 2.0826010544815465e-05} +{"ts": "2025-12-26T19:11:57", "event": "train_log", "step": 240, "epoch": 0.10126582278481013, "progress_pct": 1.69, "epoch_pct": 1.69, "eta": "61:13:46", "max_grad_norm": 0.8, "loss": 0.9960280656814575, "grad_norm": 1.890328049659729, "learning_rate": 2.100175746924429e-05} +{"ts": "2025-12-26T19:12:16", "event": "train_log", "step": 242, "epoch": 0.1021097046413502, "progress_pct": 1.7, "epoch_pct": 1.7, "eta": "61:01:26", "max_grad_norm": 0.8, "loss": 0.9848901629447937, "grad_norm": 2.021235227584839, "learning_rate": 2.117750439367311e-05} +{"ts": "2025-12-26T19:12:37", "event": "train_log", "step": 244, "epoch": 0.1029535864978903, "progress_pct": 1.72, "epoch_pct": 1.72, "eta": "60:51:03", "max_grad_norm": 0.8, "loss": 0.891694188117981, "grad_norm": 2.023920774459839, "learning_rate": 2.1353251318101936e-05} +{"ts": "2025-12-26T19:12:56", "event": "train_log", "step": 246, "epoch": 0.10379746835443038, "progress_pct": 1.73, "epoch_pct": 1.73, "eta": "60:39:04", "max_grad_norm": 0.8, "loss": 0.9059976935386658, "grad_norm": 1.8061069250106812, "learning_rate": 2.1528998242530755e-05} +{"ts": "2025-12-26T19:13:15", "event": "train_log", "step": 248, "epoch": 0.10464135021097046, "progress_pct": 1.74, "epoch_pct": 1.74, "eta": "60:27:14", "max_grad_norm": 0.8, "loss": 1.0056109428405762, "grad_norm": 2.176302194595337, "learning_rate": 2.1704745166959578e-05} +{"ts": "2025-12-26T19:13:35", "event": "train_log", "step": 250, "epoch": 0.10548523206751055, "progress_pct": 1.76, "epoch_pct": 1.76, "eta": "60:16:20", "max_grad_norm": 0.8, "loss": 0.9645357728004456, "grad_norm": 1.9820969104766846, "learning_rate": 2.18804920913884e-05} +{"ts": "2025-12-26T19:13:56", "event": "train_log", "step": 252, "epoch": 0.10632911392405063, "progress_pct": 1.77, "epoch_pct": 1.77, "eta": "60:06:19", "max_grad_norm": 0.8, "loss": 1.0178182125091553, "grad_norm": 1.8764572143554688, "learning_rate": 2.2056239015817223e-05} +{"ts": "2025-12-26T19:14:14", "event": "train_log", "step": 254, "epoch": 0.10717299578059072, "progress_pct": 1.79, "epoch_pct": 1.79, "eta": "59:53:37", "max_grad_norm": 0.8, "loss": 0.9546761512756348, "grad_norm": 2.56221342086792, "learning_rate": 2.223198594024605e-05} +{"ts": "2025-12-26T19:14:32", "event": "train_log", "step": 256, "epoch": 0.1080168776371308, "progress_pct": 1.8, "epoch_pct": 1.8, "eta": "59:41:40", "max_grad_norm": 0.8, "loss": 0.9300968647003174, "grad_norm": 2.6779074668884277, "learning_rate": 2.2407732864674868e-05} +{"ts": "2025-12-26T19:14:52", "event": "train_log", "step": 258, "epoch": 0.10886075949367088, "progress_pct": 1.81, "epoch_pct": 1.81, "eta": "59:31:28", "max_grad_norm": 0.8, "loss": 0.926638662815094, "grad_norm": 2.140897512435913, "learning_rate": 2.2583479789103694e-05} +{"ts": "2025-12-26T19:15:11", "event": "train_log", "step": 260, "epoch": 0.10970464135021098, "progress_pct": 1.83, "epoch_pct": 1.83, "eta": "59:20:26", "max_grad_norm": 0.8, "loss": 1.0681840181350708, "grad_norm": 2.0880508422851562, "learning_rate": 2.2759226713532513e-05} +{"ts": "2025-12-26T19:15:29", "event": "train_log", "step": 262, "epoch": 0.11054852320675106, "progress_pct": 1.84, "epoch_pct": 1.84, "eta": "59:08:23", "max_grad_norm": 0.8, "loss": 1.0840941667556763, "grad_norm": 2.7273616790771484, "learning_rate": 2.293497363796134e-05} +{"ts": "2025-12-26T19:15:50", "event": "train_log", "step": 264, "epoch": 0.11139240506329114, "progress_pct": 1.86, "epoch_pct": 1.86, "eta": "58:59:19", "max_grad_norm": 0.8, "loss": 0.8637182116508484, "grad_norm": 1.6723874807357788, "learning_rate": 2.3110720562390158e-05} +{"ts": "2025-12-26T19:16:10", "event": "train_log", "step": 266, "epoch": 0.11223628691983123, "progress_pct": 1.87, "epoch_pct": 1.87, "eta": "58:50:04", "max_grad_norm": 0.8, "loss": 0.9554686546325684, "grad_norm": 1.806243896484375, "learning_rate": 2.3286467486818984e-05} +{"ts": "2025-12-26T19:16:31", "event": "train_log", "step": 268, "epoch": 0.11308016877637131, "progress_pct": 1.88, "epoch_pct": 1.88, "eta": "58:41:08", "max_grad_norm": 0.8, "loss": 0.9556593894958496, "grad_norm": 1.9086743593215942, "learning_rate": 2.3462214411247803e-05} +{"ts": "2025-12-26T19:16:51", "event": "train_log", "step": 270, "epoch": 0.11392405063291139, "progress_pct": 1.9, "epoch_pct": 1.9, "eta": "58:31:57", "max_grad_norm": 0.8, "loss": 0.9177709817886353, "grad_norm": 2.1822304725646973, "learning_rate": 2.3637961335676626e-05} +{"ts": "2025-12-26T19:17:10", "event": "train_log", "step": 272, "epoch": 0.11476793248945148, "progress_pct": 1.91, "epoch_pct": 1.91, "eta": "58:22:21", "max_grad_norm": 0.8, "loss": 0.9288759827613831, "grad_norm": 2.1009039878845215, "learning_rate": 2.3813708260105448e-05} +{"ts": "2025-12-26T19:17:30", "event": "train_log", "step": 274, "epoch": 0.11561181434599156, "progress_pct": 1.93, "epoch_pct": 1.93, "eta": "58:12:40", "max_grad_norm": 0.8, "loss": 0.9881691932678223, "grad_norm": 1.9814810752868652, "learning_rate": 2.398945518453427e-05} +{"ts": "2025-12-26T19:17:51", "event": "train_log", "step": 276, "epoch": 0.11645569620253164, "progress_pct": 1.94, "epoch_pct": 1.94, "eta": "58:04:51", "max_grad_norm": 0.8, "loss": 0.9390727281570435, "grad_norm": 1.9946284294128418, "learning_rate": 2.4165202108963093e-05} +{"ts": "2025-12-26T19:18:10", "event": "train_log", "step": 278, "epoch": 0.11729957805907174, "progress_pct": 1.95, "epoch_pct": 1.95, "eta": "57:54:52", "max_grad_norm": 0.8, "loss": 0.9625692963600159, "grad_norm": 2.4489169120788574, "learning_rate": 2.4340949033391916e-05} +{"ts": "2025-12-26T19:18:28", "event": "train_log", "step": 280, "epoch": 0.11814345991561181, "progress_pct": 1.97, "epoch_pct": 1.97, "eta": "57:44:55", "max_grad_norm": 0.8, "loss": 0.9304702877998352, "grad_norm": 2.0919103622436523, "learning_rate": 2.451669595782074e-05} +{"ts": "2025-12-26T19:18:48", "event": "train_log", "step": 282, "epoch": 0.1189873417721519, "progress_pct": 1.98, "epoch_pct": 1.98, "eta": "57:36:00", "max_grad_norm": 0.8, "loss": 0.9313994646072388, "grad_norm": 1.912914752960205, "learning_rate": 2.469244288224956e-05} +{"ts": "2025-12-26T19:19:06", "event": "train_log", "step": 284, "epoch": 0.11983122362869199, "progress_pct": 2.0, "epoch_pct": 2.0, "eta": "57:25:47", "max_grad_norm": 0.8, "loss": 1.004011869430542, "grad_norm": 2.1553256511688232, "learning_rate": 2.4868189806678387e-05} +{"ts": "2025-12-26T19:19:25", "event": "train_log", "step": 286, "epoch": 0.12067510548523207, "progress_pct": 2.01, "epoch_pct": 2.01, "eta": "57:16:36", "max_grad_norm": 0.8, "loss": 0.9092531204223633, "grad_norm": 2.0129058361053467, "learning_rate": 2.504393673110721e-05} +{"ts": "2025-12-26T19:19:43", "event": "train_log", "step": 288, "epoch": 0.12151898734177215, "progress_pct": 2.03, "epoch_pct": 2.03, "eta": "57:07:12", "max_grad_norm": 0.8, "loss": 0.993347704410553, "grad_norm": 2.1632325649261475, "learning_rate": 2.5219683655536032e-05} +{"ts": "2025-12-26T19:20:02", "event": "train_log", "step": 290, "epoch": 0.12236286919831224, "progress_pct": 2.04, "epoch_pct": 2.04, "eta": "56:57:56", "max_grad_norm": 0.8, "loss": 0.978348433971405, "grad_norm": 2.3072738647460938, "learning_rate": 2.539543057996485e-05} +{"ts": "2025-12-26T19:20:21", "event": "train_log", "step": 292, "epoch": 0.12320675105485232, "progress_pct": 2.05, "epoch_pct": 2.05, "eta": "56:49:23", "max_grad_norm": 0.8, "loss": 1.0018101930618286, "grad_norm": 2.056560516357422, "learning_rate": 2.5571177504393674e-05} +{"ts": "2025-12-26T19:20:40", "event": "train_log", "step": 294, "epoch": 0.1240506329113924, "progress_pct": 2.07, "epoch_pct": 2.07, "eta": "56:40:48", "max_grad_norm": 0.8, "loss": 0.9607775211334229, "grad_norm": 1.8906747102737427, "learning_rate": 2.5746924428822493e-05} +{"ts": "2025-12-26T19:21:00", "event": "train_log", "step": 296, "epoch": 0.1248945147679325, "progress_pct": 2.08, "epoch_pct": 2.08, "eta": "56:32:40", "max_grad_norm": 0.8, "loss": 0.9259153008460999, "grad_norm": 2.1375651359558105, "learning_rate": 2.5922671353251322e-05} +{"ts": "2025-12-26T19:21:19", "event": "train_log", "step": 298, "epoch": 0.1257383966244726, "progress_pct": 2.1, "epoch_pct": 2.1, "eta": "56:24:08", "max_grad_norm": 0.8, "loss": 0.8524524569511414, "grad_norm": 1.9994823932647705, "learning_rate": 2.609841827768014e-05} +{"ts": "2025-12-26T19:21:37", "event": "train_log", "step": 300, "epoch": 0.12658227848101267, "progress_pct": 2.11, "epoch_pct": 2.11, "eta": "56:15:07", "max_grad_norm": 0.8, "loss": 1.0047069787979126, "grad_norm": 2.2421181201934814, "learning_rate": 2.6274165202108964e-05} +{"ts": "2025-12-26T19:35:57", "event": "train_log", "step": 300, "epoch": 0.12658227848101267, "progress_pct": 2.11, "epoch_pct": 2.11, "eta": "67:20:12", "max_grad_norm": 0.8, "eval_loss": 0.9517185688018799, "eval_runtime": 860.0287, "eval_samples_per_second": 2.45, "eval_steps_per_second": 2.45} +{"ts": "2025-12-26T19:36:17", "event": "train_log", "step": 302, "epoch": 0.12742616033755275, "progress_pct": 2.12, "epoch_pct": 2.12, "eta": "67:08:29", "max_grad_norm": 0.8, "loss": 0.8475471138954163, "grad_norm": 2.1206254959106445, "learning_rate": 2.6449912126537786e-05} +{"ts": "2025-12-26T19:36:39", "event": "train_log", "step": 304, "epoch": 0.12827004219409283, "progress_pct": 2.14, "epoch_pct": 2.14, "eta": "66:57:41", "max_grad_norm": 0.8, "loss": 0.8643121123313904, "grad_norm": 1.885161280632019, "learning_rate": 2.6625659050966612e-05} +{"ts": "2025-12-26T19:36:59", "event": "train_log", "step": 306, "epoch": 0.1291139240506329, "progress_pct": 2.15, "epoch_pct": 2.15, "eta": "66:46:41", "max_grad_norm": 0.8, "loss": 0.8804612159729004, "grad_norm": 3.1441781520843506, "learning_rate": 2.680140597539543e-05} +{"ts": "2025-12-26T19:37:20", "event": "train_log", "step": 308, "epoch": 0.12995780590717299, "progress_pct": 2.17, "epoch_pct": 2.17, "eta": "66:35:38", "max_grad_norm": 0.8, "loss": 0.8348029255867004, "grad_norm": 1.953133225440979, "learning_rate": 2.6977152899824254e-05} +{"ts": "2025-12-26T19:37:41", "event": "train_log", "step": 310, "epoch": 0.1308016877637131, "progress_pct": 2.18, "epoch_pct": 2.18, "eta": "66:24:43", "max_grad_norm": 0.8, "loss": 0.8889057040214539, "grad_norm": 2.3762667179107666, "learning_rate": 2.7152899824253076e-05} +{"ts": "2025-12-26T19:37:59", "event": "train_log", "step": 312, "epoch": 0.13164556962025317, "progress_pct": 2.19, "epoch_pct": 2.19, "eta": "66:12:20", "max_grad_norm": 0.8, "loss": 1.025565505027771, "grad_norm": 2.4651103019714355, "learning_rate": 2.7328646748681902e-05} +{"ts": "2025-12-26T19:38:20", "event": "train_log", "step": 314, "epoch": 0.13248945147679325, "progress_pct": 2.21, "epoch_pct": 2.21, "eta": "66:02:03", "max_grad_norm": 0.8, "loss": 0.868915855884552, "grad_norm": 1.8522284030914307, "learning_rate": 2.7504393673110725e-05} +{"ts": "2025-12-26T19:38:41", "event": "train_log", "step": 316, "epoch": 0.13333333333333333, "progress_pct": 2.22, "epoch_pct": 2.22, "eta": "65:51:54", "max_grad_norm": 0.8, "loss": 0.8821638226509094, "grad_norm": 1.8048083782196045, "learning_rate": 2.7680140597539544e-05} +{"ts": "2025-12-26T19:39:02", "event": "train_log", "step": 318, "epoch": 0.1341772151898734, "progress_pct": 2.24, "epoch_pct": 2.24, "eta": "65:41:33", "max_grad_norm": 0.8, "loss": 0.8735360503196716, "grad_norm": 1.9933605194091797, "learning_rate": 2.7855887521968367e-05} +{"ts": "2025-12-26T19:39:22", "event": "train_log", "step": 320, "epoch": 0.1350210970464135, "progress_pct": 2.25, "epoch_pct": 2.25, "eta": "65:30:58", "max_grad_norm": 0.8, "loss": 0.8288834691047668, "grad_norm": 2.044337034225464, "learning_rate": 2.8031634446397186e-05} +{"ts": "2025-12-26T19:39:41", "event": "train_log", "step": 322, "epoch": 0.1358649789029536, "progress_pct": 2.26, "epoch_pct": 2.26, "eta": "65:19:40", "max_grad_norm": 0.8, "loss": 0.9104969501495361, "grad_norm": 2.416067361831665, "learning_rate": 2.8207381370826015e-05} +{"ts": "2025-12-26T19:40:02", "event": "train_log", "step": 324, "epoch": 0.13670886075949368, "progress_pct": 2.28, "epoch_pct": 2.28, "eta": "65:09:32", "max_grad_norm": 0.8, "loss": 0.8689924478530884, "grad_norm": 2.0731265544891357, "learning_rate": 2.8383128295254834e-05} +{"ts": "2025-12-26T19:40:23", "event": "train_log", "step": 326, "epoch": 0.13755274261603376, "progress_pct": 2.29, "epoch_pct": 2.29, "eta": "64:59:59", "max_grad_norm": 0.8, "loss": 0.9312222003936768, "grad_norm": 2.049126386642456, "learning_rate": 2.8558875219683657e-05} +{"ts": "2025-12-26T19:40:44", "event": "train_log", "step": 328, "epoch": 0.13839662447257384, "progress_pct": 2.31, "epoch_pct": 2.31, "eta": "64:50:33", "max_grad_norm": 0.8, "loss": 0.8933501839637756, "grad_norm": 2.131026268005371, "learning_rate": 2.8734622144112476e-05} +{"ts": "2025-12-26T19:41:05", "event": "train_log", "step": 330, "epoch": 0.13924050632911392, "progress_pct": 2.32, "epoch_pct": 2.32, "eta": "64:40:52", "max_grad_norm": 0.8, "loss": 0.8998261094093323, "grad_norm": 1.766754150390625, "learning_rate": 2.8910369068541305e-05} +{"ts": "2025-12-26T19:41:23", "event": "train_log", "step": 332, "epoch": 0.140084388185654, "progress_pct": 2.33, "epoch_pct": 2.33, "eta": "64:29:37", "max_grad_norm": 0.8, "loss": 0.8826426267623901, "grad_norm": 2.197706460952759, "learning_rate": 2.9086115992970124e-05} +{"ts": "2025-12-26T19:41:43", "event": "train_log", "step": 334, "epoch": 0.1409282700421941, "progress_pct": 2.35, "epoch_pct": 2.35, "eta": "64:19:54", "max_grad_norm": 0.8, "loss": 0.8590307831764221, "grad_norm": 1.953715443611145, "learning_rate": 2.9261862917398947e-05} +{"ts": "2025-12-26T19:42:01", "event": "train_log", "step": 336, "epoch": 0.14177215189873418, "progress_pct": 2.36, "epoch_pct": 2.36, "eta": "64:09:05", "max_grad_norm": 0.8, "loss": 0.9317060708999634, "grad_norm": 2.200929880142212, "learning_rate": 2.943760984182777e-05} +{"ts": "2025-12-26T19:42:21", "event": "train_log", "step": 338, "epoch": 0.14261603375527426, "progress_pct": 2.38, "epoch_pct": 2.38, "eta": "63:58:59", "max_grad_norm": 0.8, "loss": 0.9965578317642212, "grad_norm": 2.1195082664489746, "learning_rate": 2.961335676625659e-05} +{"ts": "2025-12-26T19:42:40", "event": "train_log", "step": 340, "epoch": 0.14345991561181434, "progress_pct": 2.39, "epoch_pct": 2.39, "eta": "63:48:55", "max_grad_norm": 0.8, "loss": 0.8353848457336426, "grad_norm": 2.3449771404266357, "learning_rate": 2.9789103690685414e-05} +{"ts": "2025-12-26T19:42:59", "event": "train_log", "step": 342, "epoch": 0.14430379746835442, "progress_pct": 2.41, "epoch_pct": 2.41, "eta": "63:39:08", "max_grad_norm": 0.8, "loss": 0.9154735803604126, "grad_norm": 2.000497579574585, "learning_rate": 2.9964850615114237e-05} +{"ts": "2025-12-26T19:43:18", "event": "train_log", "step": 344, "epoch": 0.1451476793248945, "progress_pct": 2.42, "epoch_pct": 2.42, "eta": "63:28:59", "max_grad_norm": 0.8, "loss": 0.9530655741691589, "grad_norm": 2.141890525817871, "learning_rate": 3.014059753954306e-05} +{"ts": "2025-12-26T19:43:38", "event": "train_log", "step": 346, "epoch": 0.1459915611814346, "progress_pct": 2.43, "epoch_pct": 2.43, "eta": "63:19:48", "max_grad_norm": 0.8, "loss": 0.896998405456543, "grad_norm": 1.7717392444610596, "learning_rate": 3.031634446397188e-05} +{"ts": "2025-12-26T19:43:58", "event": "train_log", "step": 348, "epoch": 0.1468354430379747, "progress_pct": 2.45, "epoch_pct": 2.45, "eta": "63:10:16", "max_grad_norm": 0.8, "loss": 0.9084208011627197, "grad_norm": 1.8796685934066772, "learning_rate": 3.0492091388400708e-05} +{"ts": "2025-12-26T19:44:17", "event": "train_log", "step": 350, "epoch": 0.14767932489451477, "progress_pct": 2.46, "epoch_pct": 2.46, "eta": "63:00:36", "max_grad_norm": 0.8, "loss": 0.9183387756347656, "grad_norm": 2.0298709869384766, "learning_rate": 3.066783831282953e-05} +{"ts": "2025-12-26T19:44:37", "event": "train_log", "step": 352, "epoch": 0.14852320675105485, "progress_pct": 2.48, "epoch_pct": 2.48, "eta": "62:52:10", "max_grad_norm": 0.8, "loss": 0.8624772429466248, "grad_norm": 1.9245645999908447, "learning_rate": 3.084358523725835e-05} +{"ts": "2025-12-26T19:44:57", "event": "train_log", "step": 354, "epoch": 0.14936708860759493, "progress_pct": 2.49, "epoch_pct": 2.49, "eta": "62:43:11", "max_grad_norm": 0.8, "loss": 0.9142400026321411, "grad_norm": 2.325681209564209, "learning_rate": 3.101933216168717e-05} +{"ts": "2025-12-26T19:45:15", "event": "train_log", "step": 356, "epoch": 0.150210970464135, "progress_pct": 2.5, "epoch_pct": 2.5, "eta": "62:33:23", "max_grad_norm": 0.8, "loss": 0.9064018130302429, "grad_norm": 2.1200530529022217, "learning_rate": 3.1195079086115995e-05} +{"ts": "2025-12-26T19:45:35", "event": "train_log", "step": 358, "epoch": 0.15105485232067511, "progress_pct": 2.52, "epoch_pct": 2.52, "eta": "62:24:22", "max_grad_norm": 0.8, "loss": 0.9199238419532776, "grad_norm": 1.979314923286438, "learning_rate": 3.137082601054482e-05} +{"ts": "2025-12-26T19:45:54", "event": "train_log", "step": 360, "epoch": 0.1518987341772152, "progress_pct": 2.53, "epoch_pct": 2.53, "eta": "62:15:11", "max_grad_norm": 0.8, "loss": 0.8030132055282593, "grad_norm": 2.1122689247131348, "learning_rate": 3.154657293497364e-05} +{"ts": "2025-12-26T19:46:12", "event": "train_log", "step": 362, "epoch": 0.15274261603375527, "progress_pct": 2.55, "epoch_pct": 2.55, "eta": "62:05:48", "max_grad_norm": 0.8, "loss": 0.9185854196548462, "grad_norm": 2.105767250061035, "learning_rate": 3.172231985940246e-05} +{"ts": "2025-12-26T19:46:31", "event": "train_log", "step": 364, "epoch": 0.15358649789029535, "progress_pct": 2.56, "epoch_pct": 2.56, "eta": "61:57:04", "max_grad_norm": 0.8, "loss": 0.9365083575248718, "grad_norm": 2.179471015930176, "learning_rate": 3.1898066783831285e-05} +{"ts": "2025-12-26T19:46:51", "event": "train_log", "step": 366, "epoch": 0.15443037974683543, "progress_pct": 2.57, "epoch_pct": 2.57, "eta": "61:48:55", "max_grad_norm": 0.8, "loss": 0.8965140581130981, "grad_norm": 2.1444311141967773, "learning_rate": 3.207381370826011e-05} +{"ts": "2025-12-26T19:47:10", "event": "train_log", "step": 368, "epoch": 0.15527426160337554, "progress_pct": 2.59, "epoch_pct": 2.59, "eta": "61:39:58", "max_grad_norm": 0.8, "loss": 0.8787504434585571, "grad_norm": 2.4171674251556396, "learning_rate": 3.224956063268893e-05} +{"ts": "2025-12-26T19:47:29", "event": "train_log", "step": 370, "epoch": 0.15611814345991562, "progress_pct": 2.6, "epoch_pct": 2.6, "eta": "61:31:10", "max_grad_norm": 0.8, "loss": 0.8925284147262573, "grad_norm": 2.418628215789795, "learning_rate": 3.242530755711775e-05} +{"ts": "2025-12-26T19:47:49", "event": "train_log", "step": 372, "epoch": 0.1569620253164557, "progress_pct": 2.62, "epoch_pct": 2.62, "eta": "61:22:59", "max_grad_norm": 0.8, "loss": 0.876179039478302, "grad_norm": 2.2228314876556396, "learning_rate": 3.2601054481546575e-05} +{"ts": "2025-12-26T19:48:08", "event": "train_log", "step": 374, "epoch": 0.15780590717299578, "progress_pct": 2.63, "epoch_pct": 2.63, "eta": "61:14:46", "max_grad_norm": 0.8, "loss": 0.8365707993507385, "grad_norm": 2.324237108230591, "learning_rate": 3.27768014059754e-05} +{"ts": "2025-12-26T19:48:30", "event": "train_log", "step": 376, "epoch": 0.15864978902953586, "progress_pct": 2.64, "epoch_pct": 2.64, "eta": "61:08:01", "max_grad_norm": 0.8, "loss": 0.7864399552345276, "grad_norm": 2.6344552040100098, "learning_rate": 3.295254833040422e-05} +{"ts": "2025-12-26T19:48:49", "event": "train_log", "step": 378, "epoch": 0.15949367088607594, "progress_pct": 2.66, "epoch_pct": 2.66, "eta": "60:59:57", "max_grad_norm": 0.8, "loss": 0.9271875023841858, "grad_norm": 2.047536611557007, "learning_rate": 3.312829525483304e-05} +{"ts": "2025-12-26T19:49:08", "event": "train_log", "step": 380, "epoch": 0.16033755274261605, "progress_pct": 2.67, "epoch_pct": 2.67, "eta": "60:51:36", "max_grad_norm": 0.8, "loss": 0.8799133896827698, "grad_norm": 2.120025157928467, "learning_rate": 3.3304042179261865e-05} +{"ts": "2025-12-26T19:49:28", "event": "train_log", "step": 382, "epoch": 0.16118143459915613, "progress_pct": 2.69, "epoch_pct": 2.69, "eta": "60:44:07", "max_grad_norm": 0.8, "loss": 0.8973530530929565, "grad_norm": 2.363692045211792, "learning_rate": 3.347978910369069e-05} +{"ts": "2025-12-26T19:49:47", "event": "train_log", "step": 384, "epoch": 0.1620253164556962, "progress_pct": 2.7, "epoch_pct": 2.7, "eta": "60:35:37", "max_grad_norm": 0.8, "loss": 1.0277652740478516, "grad_norm": 2.1796772480010986, "learning_rate": 3.365553602811951e-05} +{"ts": "2025-12-26T19:50:06", "event": "train_log", "step": 386, "epoch": 0.16286919831223629, "progress_pct": 2.71, "epoch_pct": 2.71, "eta": "60:27:44", "max_grad_norm": 0.8, "loss": 0.8909643888473511, "grad_norm": 1.9192595481872559, "learning_rate": 3.383128295254833e-05} +{"ts": "2025-12-26T19:50:27", "event": "train_log", "step": 388, "epoch": 0.16371308016877636, "progress_pct": 2.73, "epoch_pct": 2.73, "eta": "60:21:00", "max_grad_norm": 0.8, "loss": 0.837049663066864, "grad_norm": 1.7874376773834229, "learning_rate": 3.4007029876977155e-05} +{"ts": "2025-12-26T19:50:47", "event": "train_log", "step": 390, "epoch": 0.16455696202531644, "progress_pct": 2.74, "epoch_pct": 2.74, "eta": "60:13:37", "max_grad_norm": 0.8, "loss": 0.8625202775001526, "grad_norm": 2.3402366638183594, "learning_rate": 3.4182776801405974e-05} +{"ts": "2025-12-26T19:51:06", "event": "train_log", "step": 392, "epoch": 0.16540084388185655, "progress_pct": 2.76, "epoch_pct": 2.76, "eta": "60:05:55", "max_grad_norm": 0.8, "loss": 0.9288321137428284, "grad_norm": 2.1137185096740723, "learning_rate": 3.43585237258348e-05} +{"ts": "2025-12-26T19:51:25", "event": "train_log", "step": 394, "epoch": 0.16624472573839663, "progress_pct": 2.77, "epoch_pct": 2.77, "eta": "59:58:26", "max_grad_norm": 0.8, "loss": 0.9328726530075073, "grad_norm": 2.3776895999908447, "learning_rate": 3.453427065026362e-05} +{"ts": "2025-12-26T19:51:45", "event": "train_log", "step": 396, "epoch": 0.1670886075949367, "progress_pct": 2.78, "epoch_pct": 2.78, "eta": "59:51:02", "max_grad_norm": 0.8, "loss": 0.9273309707641602, "grad_norm": 2.34941029548645, "learning_rate": 3.4710017574692445e-05} +{"ts": "2025-12-26T19:52:03", "event": "train_log", "step": 398, "epoch": 0.1679324894514768, "progress_pct": 2.8, "epoch_pct": 2.8, "eta": "59:43:20", "max_grad_norm": 0.8, "loss": 0.8703887462615967, "grad_norm": 2.1272573471069336, "learning_rate": 3.4885764499121264e-05} +{"ts": "2025-12-26T19:52:23", "event": "train_log", "step": 400, "epoch": 0.16877637130801687, "progress_pct": 2.81, "epoch_pct": 2.81, "eta": "59:36:04", "max_grad_norm": 0.8, "loss": 0.8808165788650513, "grad_norm": 2.047290802001953, "learning_rate": 3.506151142355009e-05} +{"ts": "2025-12-26T20:06:52", "event": "train_log", "step": 400, "epoch": 0.16877637130801687, "progress_pct": 2.81, "epoch_pct": 2.81, "eta": "67:56:51", "max_grad_norm": 0.8, "eval_loss": 0.9282881617546082, "eval_runtime": 869.6867, "eval_samples_per_second": 2.423, "eval_steps_per_second": 2.423} +{"ts": "2025-12-26T20:07:12", "event": "train_log", "step": 402, "epoch": 0.16962025316455695, "progress_pct": 2.83, "epoch_pct": 2.83, "eta": "67:47:01", "max_grad_norm": 0.8, "loss": 0.9643645286560059, "grad_norm": 1.9874159097671509, "learning_rate": 3.5237258347978916e-05} +{"ts": "2025-12-26T20:07:31", "event": "train_log", "step": 404, "epoch": 0.17046413502109706, "progress_pct": 2.84, "epoch_pct": 2.84, "eta": "67:37:18", "max_grad_norm": 0.8, "loss": 0.9173495769500732, "grad_norm": 1.9299919605255127, "learning_rate": 3.5413005272407735e-05} +{"ts": "2025-12-26T20:07:50", "event": "train_log", "step": 406, "epoch": 0.17130801687763714, "progress_pct": 2.86, "epoch_pct": 2.86, "eta": "67:27:33", "max_grad_norm": 0.8, "loss": 0.8998411893844604, "grad_norm": 2.3379697799682617, "learning_rate": 3.5588752196836555e-05} +{"ts": "2025-12-26T20:08:08", "event": "train_log", "step": 408, "epoch": 0.17215189873417722, "progress_pct": 2.87, "epoch_pct": 2.87, "eta": "67:17:18", "max_grad_norm": 0.8, "loss": 0.9310802221298218, "grad_norm": 2.241370916366577, "learning_rate": 3.5764499121265374e-05} +{"ts": "2025-12-26T20:08:27", "event": "train_log", "step": 410, "epoch": 0.1729957805907173, "progress_pct": 2.88, "epoch_pct": 2.88, "eta": "67:07:28", "max_grad_norm": 0.8, "loss": 0.9605053067207336, "grad_norm": 2.4490108489990234, "learning_rate": 3.5940246045694206e-05} +{"ts": "2025-12-26T20:08:47", "event": "train_log", "step": 412, "epoch": 0.17383966244725738, "progress_pct": 2.9, "epoch_pct": 2.9, "eta": "66:58:54", "max_grad_norm": 0.8, "loss": 0.8485683798789978, "grad_norm": 1.8247230052947998, "learning_rate": 3.6115992970123026e-05} +{"ts": "2025-12-26T20:09:06", "event": "train_log", "step": 414, "epoch": 0.17468354430379746, "progress_pct": 2.91, "epoch_pct": 2.91, "eta": "66:49:22", "max_grad_norm": 0.8, "loss": 0.9325968623161316, "grad_norm": 2.4608843326568604, "learning_rate": 3.6291739894551845e-05} +{"ts": "2025-12-26T20:09:26", "event": "train_log", "step": 416, "epoch": 0.17552742616033756, "progress_pct": 2.93, "epoch_pct": 2.93, "eta": "66:40:18", "max_grad_norm": 0.8, "loss": 0.9125096201896667, "grad_norm": 1.8923161029815674, "learning_rate": 3.646748681898067e-05} +{"ts": "2025-12-26T20:09:45", "event": "train_log", "step": 418, "epoch": 0.17637130801687764, "progress_pct": 2.94, "epoch_pct": 2.94, "eta": "66:31:15", "max_grad_norm": 0.8, "loss": 0.8852217197418213, "grad_norm": 1.8502769470214844, "learning_rate": 3.6643233743409497e-05} +{"ts": "2025-12-26T20:10:05", "event": "train_log", "step": 420, "epoch": 0.17721518987341772, "progress_pct": 2.95, "epoch_pct": 2.95, "eta": "66:22:37", "max_grad_norm": 0.8, "loss": 0.9192792773246765, "grad_norm": 1.9155100584030151, "learning_rate": 3.6818980667838316e-05} +{"ts": "2025-12-26T20:10:25", "event": "train_log", "step": 422, "epoch": 0.1780590717299578, "progress_pct": 2.97, "epoch_pct": 2.97, "eta": "66:13:55", "max_grad_norm": 0.8, "loss": 0.8787404298782349, "grad_norm": 2.181476593017578, "learning_rate": 3.6994727592267135e-05} +{"ts": "2025-12-26T20:10:43", "event": "train_log", "step": 424, "epoch": 0.17890295358649788, "progress_pct": 2.98, "epoch_pct": 2.98, "eta": "66:04:38", "max_grad_norm": 0.8, "loss": 0.9109582901000977, "grad_norm": 2.2469847202301025, "learning_rate": 3.717047451669596e-05} +{"ts": "2025-12-26T20:11:02", "event": "train_log", "step": 426, "epoch": 0.17974683544303796, "progress_pct": 3.0, "epoch_pct": 3.0, "eta": "65:55:45", "max_grad_norm": 0.8, "loss": 0.8560389280319214, "grad_norm": 2.08145809173584, "learning_rate": 3.734622144112479e-05} +{"ts": "2025-12-26T20:11:21", "event": "train_log", "step": 428, "epoch": 0.18059071729957807, "progress_pct": 3.01, "epoch_pct": 3.01, "eta": "65:46:40", "max_grad_norm": 0.8, "loss": 0.9456104040145874, "grad_norm": 4.121932506561279, "learning_rate": 3.7521968365553606e-05} +{"ts": "2025-12-26T20:11:42", "event": "train_log", "step": 430, "epoch": 0.18143459915611815, "progress_pct": 3.02, "epoch_pct": 3.02, "eta": "65:38:57", "max_grad_norm": 0.8, "loss": 0.8421300649642944, "grad_norm": 2.177459478378296, "learning_rate": 3.7697715289982425e-05} +{"ts": "2025-12-26T20:12:01", "event": "train_log", "step": 432, "epoch": 0.18227848101265823, "progress_pct": 3.04, "epoch_pct": 3.04, "eta": "65:30:21", "max_grad_norm": 0.8, "loss": 0.9199858903884888, "grad_norm": 2.324970245361328, "learning_rate": 3.787346221441125e-05} +{"ts": "2025-12-26T20:12:20", "event": "train_log", "step": 434, "epoch": 0.1831223628691983, "progress_pct": 3.05, "epoch_pct": 3.05, "eta": "65:21:45", "max_grad_norm": 0.8, "loss": 0.8953126668930054, "grad_norm": 2.133718490600586, "learning_rate": 3.804920913884007e-05} +{"ts": "2025-12-26T20:12:42", "event": "train_log", "step": 436, "epoch": 0.1839662447257384, "progress_pct": 3.07, "epoch_pct": 3.07, "eta": "65:14:28", "max_grad_norm": 0.8, "loss": 0.8732239007949829, "grad_norm": 1.8527995347976685, "learning_rate": 3.8224956063268896e-05} +{"ts": "2025-12-26T20:13:02", "event": "train_log", "step": 438, "epoch": 0.1848101265822785, "progress_pct": 3.08, "epoch_pct": 3.08, "eta": "65:06:32", "max_grad_norm": 0.8, "loss": 0.8818746209144592, "grad_norm": 1.95817232131958, "learning_rate": 3.8400702987697715e-05} +{"ts": "2025-12-26T20:13:20", "event": "train_log", "step": 440, "epoch": 0.18565400843881857, "progress_pct": 3.09, "epoch_pct": 3.09, "eta": "64:57:49", "max_grad_norm": 0.8, "loss": 0.9153507947921753, "grad_norm": 2.2107293605804443, "learning_rate": 3.857644991212654e-05} +{"ts": "2025-12-26T20:13:40", "event": "train_log", "step": 442, "epoch": 0.18649789029535865, "progress_pct": 3.11, "epoch_pct": 3.11, "eta": "64:50:09", "max_grad_norm": 0.8, "loss": 0.8960154056549072, "grad_norm": 2.004754066467285, "learning_rate": 3.875219683655536e-05} +{"ts": "2025-12-26T20:14:00", "event": "train_log", "step": 444, "epoch": 0.18734177215189873, "progress_pct": 3.12, "epoch_pct": 3.12, "eta": "64:42:10", "max_grad_norm": 0.8, "loss": 0.909011721611023, "grad_norm": 2.1851706504821777, "learning_rate": 3.8927943760984186e-05} +{"ts": "2025-12-26T20:14:19", "event": "train_log", "step": 446, "epoch": 0.1881856540084388, "progress_pct": 3.14, "epoch_pct": 3.14, "eta": "64:33:57", "max_grad_norm": 0.8, "loss": 0.8880158066749573, "grad_norm": 2.4492485523223877, "learning_rate": 3.9103690685413005e-05} +{"ts": "2025-12-26T20:14:37", "event": "train_log", "step": 448, "epoch": 0.1890295358649789, "progress_pct": 3.15, "epoch_pct": 3.15, "eta": "64:25:39", "max_grad_norm": 0.8, "loss": 0.8500842452049255, "grad_norm": 2.745453119277954, "learning_rate": 3.927943760984183e-05} +{"ts": "2025-12-26T20:14:56", "event": "train_log", "step": 450, "epoch": 0.189873417721519, "progress_pct": 3.16, "epoch_pct": 3.16, "eta": "64:17:36", "max_grad_norm": 0.8, "loss": 0.9004045724868774, "grad_norm": 2.1924264430999756, "learning_rate": 3.945518453427065e-05} +{"ts": "2025-12-26T20:15:15", "event": "train_log", "step": 452, "epoch": 0.19071729957805908, "progress_pct": 3.18, "epoch_pct": 3.18, "eta": "64:09:16", "max_grad_norm": 0.8, "loss": 0.9020664095878601, "grad_norm": 2.4051687717437744, "learning_rate": 3.9630931458699476e-05} +{"ts": "2025-12-26T20:15:34", "event": "train_log", "step": 454, "epoch": 0.19156118143459916, "progress_pct": 3.19, "epoch_pct": 3.19, "eta": "64:01:33", "max_grad_norm": 0.8, "loss": 0.8639500737190247, "grad_norm": 1.8077667951583862, "learning_rate": 3.9806678383128295e-05} +{"ts": "2025-12-26T20:15:53", "event": "train_log", "step": 456, "epoch": 0.19240506329113924, "progress_pct": 3.21, "epoch_pct": 3.21, "eta": "63:53:28", "max_grad_norm": 0.8, "loss": 0.8642048239707947, "grad_norm": 2.089043378829956, "learning_rate": 3.998242530755712e-05} +{"ts": "2025-12-26T20:16:11", "event": "train_log", "step": 458, "epoch": 0.19324894514767932, "progress_pct": 3.22, "epoch_pct": 3.22, "eta": "63:45:28", "max_grad_norm": 0.8, "loss": 0.9371927380561829, "grad_norm": 2.029578447341919, "learning_rate": 4.015817223198594e-05} +{"ts": "2025-12-26T20:16:30", "event": "train_log", "step": 460, "epoch": 0.1940928270042194, "progress_pct": 3.23, "epoch_pct": 3.23, "eta": "63:37:42", "max_grad_norm": 0.8, "loss": 0.9120588302612305, "grad_norm": 2.26582407951355, "learning_rate": 4.033391915641476e-05} +{"ts": "2025-12-26T20:16:50", "event": "train_log", "step": 462, "epoch": 0.1949367088607595, "progress_pct": 3.25, "epoch_pct": 3.25, "eta": "63:30:17", "max_grad_norm": 0.8, "loss": 0.8758644461631775, "grad_norm": 1.8671411275863647, "learning_rate": 4.050966608084359e-05} +{"ts": "2025-12-26T20:17:10", "event": "train_log", "step": 464, "epoch": 0.19578059071729959, "progress_pct": 3.26, "epoch_pct": 3.26, "eta": "63:23:21", "max_grad_norm": 0.8, "loss": 0.914577305316925, "grad_norm": 1.9403492212295532, "learning_rate": 4.068541300527241e-05} +{"ts": "2025-12-26T20:17:29", "event": "train_log", "step": 466, "epoch": 0.19662447257383966, "progress_pct": 3.28, "epoch_pct": 3.28, "eta": "63:15:54", "max_grad_norm": 0.8, "loss": 0.8592531681060791, "grad_norm": 1.9939641952514648, "learning_rate": 4.086115992970123e-05} +{"ts": "2025-12-26T20:17:49", "event": "train_log", "step": 468, "epoch": 0.19746835443037974, "progress_pct": 3.29, "epoch_pct": 3.29, "eta": "63:08:43", "max_grad_norm": 0.8, "loss": 0.9251965880393982, "grad_norm": 2.1511380672454834, "learning_rate": 4.103690685413005e-05} +{"ts": "2025-12-26T20:18:07", "event": "train_log", "step": 470, "epoch": 0.19831223628691982, "progress_pct": 3.31, "epoch_pct": 3.31, "eta": "63:01:15", "max_grad_norm": 0.8, "loss": 0.8465172052383423, "grad_norm": 2.2260982990264893, "learning_rate": 4.121265377855888e-05} +{"ts": "2025-12-26T20:18:28", "event": "train_log", "step": 472, "epoch": 0.1991561181434599, "progress_pct": 3.32, "epoch_pct": 3.32, "eta": "62:54:25", "max_grad_norm": 0.8, "loss": 0.8943672180175781, "grad_norm": 2.0510010719299316, "learning_rate": 4.13884007029877e-05} +{"ts": "2025-12-26T20:18:48", "event": "train_log", "step": 474, "epoch": 0.2, "progress_pct": 3.33, "epoch_pct": 3.33, "eta": "62:47:40", "max_grad_norm": 0.8, "loss": 0.9594319462776184, "grad_norm": 2.2040133476257324, "learning_rate": 4.156414762741652e-05} +{"ts": "2025-12-26T20:19:07", "event": "train_log", "step": 476, "epoch": 0.2008438818565401, "progress_pct": 3.35, "epoch_pct": 3.35, "eta": "62:40:44", "max_grad_norm": 0.8, "loss": 0.9031813144683838, "grad_norm": 2.355181932449341, "learning_rate": 4.173989455184534e-05} +{"ts": "2025-12-26T20:19:27", "event": "train_log", "step": 478, "epoch": 0.20168776371308017, "progress_pct": 3.36, "epoch_pct": 3.36, "eta": "62:33:55", "max_grad_norm": 0.8, "loss": 0.9225798845291138, "grad_norm": 2.8434665203094482, "learning_rate": 4.1915641476274166e-05} +{"ts": "2025-12-26T20:19:46", "event": "train_log", "step": 480, "epoch": 0.20253164556962025, "progress_pct": 3.38, "epoch_pct": 3.38, "eta": "62:26:39", "max_grad_norm": 0.8, "loss": 0.894163966178894, "grad_norm": 2.1715340614318848, "learning_rate": 4.209138840070299e-05} +{"ts": "2025-12-26T20:20:05", "event": "train_log", "step": 482, "epoch": 0.20337552742616033, "progress_pct": 3.39, "epoch_pct": 3.39, "eta": "62:19:55", "max_grad_norm": 0.8, "loss": 0.8424109816551208, "grad_norm": 2.078916072845459, "learning_rate": 4.226713532513181e-05} +{"ts": "2025-12-26T20:20:24", "event": "train_log", "step": 484, "epoch": 0.2042194092827004, "progress_pct": 3.4, "epoch_pct": 3.4, "eta": "62:12:53", "max_grad_norm": 0.8, "loss": 0.9102715849876404, "grad_norm": 1.9760961532592773, "learning_rate": 4.244288224956064e-05} +{"ts": "2025-12-26T20:20:45", "event": "train_log", "step": 486, "epoch": 0.20506329113924052, "progress_pct": 3.42, "epoch_pct": 3.42, "eta": "62:06:35", "max_grad_norm": 0.8, "loss": 0.8693854808807373, "grad_norm": 1.9684507846832275, "learning_rate": 4.2618629173989456e-05} +{"ts": "2025-12-26T20:21:05", "event": "train_log", "step": 488, "epoch": 0.2059071729957806, "progress_pct": 3.43, "epoch_pct": 3.43, "eta": "62:00:19", "max_grad_norm": 0.8, "loss": 0.8617543578147888, "grad_norm": 2.1633450984954834, "learning_rate": 4.279437609841828e-05} +{"ts": "2025-12-26T20:21:25", "event": "train_log", "step": 490, "epoch": 0.20675105485232068, "progress_pct": 3.45, "epoch_pct": 3.45, "eta": "61:53:49", "max_grad_norm": 0.8, "loss": 0.9167086482048035, "grad_norm": 2.2695257663726807, "learning_rate": 4.29701230228471e-05} +{"ts": "2025-12-26T20:21:45", "event": "train_log", "step": 492, "epoch": 0.20759493670886076, "progress_pct": 3.46, "epoch_pct": 3.46, "eta": "61:47:31", "max_grad_norm": 0.8, "loss": 0.8333520889282227, "grad_norm": 2.4180049896240234, "learning_rate": 4.314586994727593e-05} +{"ts": "2025-12-26T20:22:04", "event": "train_log", "step": 494, "epoch": 0.20843881856540084, "progress_pct": 3.47, "epoch_pct": 3.47, "eta": "61:40:56", "max_grad_norm": 0.8, "loss": 0.918351411819458, "grad_norm": 2.2942769527435303, "learning_rate": 4.3321616871704746e-05} +{"ts": "2025-12-26T20:22:25", "event": "train_log", "step": 496, "epoch": 0.20928270042194091, "progress_pct": 3.49, "epoch_pct": 3.49, "eta": "61:34:47", "max_grad_norm": 0.8, "loss": 0.8565171957015991, "grad_norm": 1.826458215713501, "learning_rate": 4.349736379613357e-05} +{"ts": "2025-12-26T20:22:44", "event": "train_log", "step": 498, "epoch": 0.21012658227848102, "progress_pct": 3.5, "epoch_pct": 3.5, "eta": "61:28:28", "max_grad_norm": 0.8, "loss": 0.8684167861938477, "grad_norm": 1.9694055318832397, "learning_rate": 4.367311072056239e-05} +{"ts": "2025-12-26T20:23:05", "event": "train_log", "step": 500, "epoch": 0.2109704641350211, "progress_pct": 3.52, "epoch_pct": 3.52, "eta": "61:22:45", "max_grad_norm": 0.8, "loss": 0.7752788662910461, "grad_norm": 1.892659306526184, "learning_rate": 4.384885764499122e-05} +{"ts": "2025-12-26T20:37:22", "event": "train_log", "step": 500, "epoch": 0.2109704641350211, "progress_pct": 3.52, "epoch_pct": 3.52, "eta": "67:54:43", "max_grad_norm": 0.8, "eval_loss": 0.9080732464790344, "eval_runtime": 857.0753, "eval_samples_per_second": 2.458, "eval_steps_per_second": 2.458} +{"ts": "2025-12-26T20:37:41", "event": "train_log", "step": 502, "epoch": 0.21181434599156118, "progress_pct": 3.53, "epoch_pct": 3.53, "eta": "67:46:36", "max_grad_norm": 0.8, "loss": 0.948570728302002, "grad_norm": 1.9322253465652466, "learning_rate": 4.4024604569420036e-05} +{"ts": "2025-12-26T20:38:00", "event": "train_log", "step": 504, "epoch": 0.21265822784810126, "progress_pct": 3.54, "epoch_pct": 3.54, "eta": "67:38:20", "max_grad_norm": 0.8, "loss": 0.8741024732589722, "grad_norm": 2.0456058979034424, "learning_rate": 4.4200351493848855e-05} +{"ts": "2025-12-26T20:38:18", "event": "train_log", "step": 506, "epoch": 0.21350210970464134, "progress_pct": 3.56, "epoch_pct": 3.56, "eta": "67:30:03", "max_grad_norm": 0.8, "loss": 0.9053841829299927, "grad_norm": 2.2406177520751953, "learning_rate": 4.437609841827768e-05} +{"ts": "2025-12-26T20:38:37", "event": "train_log", "step": 508, "epoch": 0.21434599156118145, "progress_pct": 3.57, "epoch_pct": 3.57, "eta": "67:22:03", "max_grad_norm": 0.8, "loss": 0.8886576294898987, "grad_norm": 2.013934850692749, "learning_rate": 4.455184534270651e-05} +{"ts": "2025-12-26T20:38:58", "event": "train_log", "step": 510, "epoch": 0.21518987341772153, "progress_pct": 3.59, "epoch_pct": 3.59, "eta": "67:14:51", "max_grad_norm": 0.8, "loss": 0.8834167718887329, "grad_norm": 1.9771125316619873, "learning_rate": 4.4727592267135326e-05} +{"ts": "2025-12-26T20:39:18", "event": "train_log", "step": 512, "epoch": 0.2160337552742616, "progress_pct": 3.6, "epoch_pct": 3.6, "eta": "67:07:37", "max_grad_norm": 0.8, "loss": 0.7938863039016724, "grad_norm": 1.785905361175537, "learning_rate": 4.4903339191564146e-05} +{"ts": "2025-12-26T20:39:39", "event": "train_log", "step": 514, "epoch": 0.2168776371308017, "progress_pct": 3.61, "epoch_pct": 3.61, "eta": "67:00:31", "max_grad_norm": 0.8, "loss": 0.8071596026420593, "grad_norm": 1.7946031093597412, "learning_rate": 4.507908611599297e-05} +{"ts": "2025-12-26T20:39:58", "event": "train_log", "step": 516, "epoch": 0.21772151898734177, "progress_pct": 3.63, "epoch_pct": 3.63, "eta": "66:52:56", "max_grad_norm": 0.8, "loss": 0.797417163848877, "grad_norm": 2.2217721939086914, "learning_rate": 4.52548330404218e-05} +{"ts": "2025-12-26T20:40:18", "event": "train_log", "step": 518, "epoch": 0.21856540084388185, "progress_pct": 3.64, "epoch_pct": 3.64, "eta": "66:45:34", "max_grad_norm": 0.8, "loss": 0.8109536170959473, "grad_norm": 1.9022471904754639, "learning_rate": 4.5430579964850617e-05} +{"ts": "2025-12-26T20:40:38", "event": "train_log", "step": 520, "epoch": 0.21940928270042195, "progress_pct": 3.66, "epoch_pct": 3.66, "eta": "66:38:08", "max_grad_norm": 0.8, "loss": 0.8647034168243408, "grad_norm": 1.8988343477249146, "learning_rate": 4.5606326889279436e-05} +{"ts": "2025-12-26T20:40:56", "event": "train_log", "step": 522, "epoch": 0.22025316455696203, "progress_pct": 3.67, "epoch_pct": 3.67, "eta": "66:30:08", "max_grad_norm": 0.8, "loss": 0.8763713240623474, "grad_norm": 2.6014881134033203, "learning_rate": 4.578207381370827e-05} +{"ts": "2025-12-26T20:41:14", "event": "train_log", "step": 524, "epoch": 0.2210970464135021, "progress_pct": 3.68, "epoch_pct": 3.68, "eta": "66:22:19", "max_grad_norm": 0.8, "loss": 0.9525764584541321, "grad_norm": 1.9512032270431519, "learning_rate": 4.595782073813709e-05} +{"ts": "2025-12-26T20:41:34", "event": "train_log", "step": 526, "epoch": 0.2219409282700422, "progress_pct": 3.7, "epoch_pct": 3.7, "eta": "66:15:04", "max_grad_norm": 0.8, "loss": 0.8839208483695984, "grad_norm": 1.9246160984039307, "learning_rate": 4.613356766256591e-05} +{"ts": "2025-12-26T20:41:53", "event": "train_log", "step": 528, "epoch": 0.22278481012658227, "progress_pct": 3.71, "epoch_pct": 3.71, "eta": "66:07:54", "max_grad_norm": 0.8, "loss": 0.8888868093490601, "grad_norm": 1.9713703393936157, "learning_rate": 4.6309314586994726e-05} +{"ts": "2025-12-26T20:42:11", "event": "train_log", "step": 530, "epoch": 0.22362869198312235, "progress_pct": 3.73, "epoch_pct": 3.73, "eta": "66:00:03", "max_grad_norm": 0.8, "loss": 0.8123540878295898, "grad_norm": 2.1175239086151123, "learning_rate": 4.648506151142355e-05} +{"ts": "2025-12-26T20:42:33", "event": "train_log", "step": 532, "epoch": 0.22447257383966246, "progress_pct": 3.74, "epoch_pct": 3.74, "eta": "65:53:48", "max_grad_norm": 0.8, "loss": 0.7447702884674072, "grad_norm": 1.7656135559082031, "learning_rate": 4.666080843585238e-05} +{"ts": "2025-12-26T20:42:52", "event": "train_log", "step": 534, "epoch": 0.22531645569620254, "progress_pct": 3.76, "epoch_pct": 3.76, "eta": "65:46:50", "max_grad_norm": 0.8, "loss": 0.8778411746025085, "grad_norm": 2.15748929977417, "learning_rate": 4.68365553602812e-05} +{"ts": "2025-12-26T20:43:11", "event": "train_log", "step": 536, "epoch": 0.22616033755274262, "progress_pct": 3.77, "epoch_pct": 3.77, "eta": "65:39:31", "max_grad_norm": 0.8, "loss": 0.8985894918441772, "grad_norm": 2.1733345985412598, "learning_rate": 4.7012302284710016e-05} +{"ts": "2025-12-26T20:43:31", "event": "train_log", "step": 538, "epoch": 0.2270042194092827, "progress_pct": 3.78, "epoch_pct": 3.78, "eta": "65:32:42", "max_grad_norm": 0.8, "loss": 0.8031114339828491, "grad_norm": 1.7182204723358154, "learning_rate": 4.718804920913884e-05} +{"ts": "2025-12-26T20:43:50", "event": "train_log", "step": 540, "epoch": 0.22784810126582278, "progress_pct": 3.8, "epoch_pct": 3.8, "eta": "65:25:43", "max_grad_norm": 0.8, "loss": 0.9399706721305847, "grad_norm": 1.8586329221725464, "learning_rate": 4.736379613356767e-05} +{"ts": "2025-12-26T20:44:08", "event": "train_log", "step": 542, "epoch": 0.22869198312236286, "progress_pct": 3.81, "epoch_pct": 3.81, "eta": "65:18:08", "max_grad_norm": 0.8, "loss": 0.8672119975090027, "grad_norm": 2.105637311935425, "learning_rate": 4.753954305799649e-05} +{"ts": "2025-12-26T20:44:29", "event": "train_log", "step": 544, "epoch": 0.22953586497890296, "progress_pct": 3.83, "epoch_pct": 3.83, "eta": "65:11:55", "max_grad_norm": 0.8, "loss": 0.8663905262947083, "grad_norm": 1.760584831237793, "learning_rate": 4.771528998242531e-05} +{"ts": "2025-12-26T20:44:50", "event": "train_log", "step": 546, "epoch": 0.23037974683544304, "progress_pct": 3.84, "epoch_pct": 3.84, "eta": "65:05:57", "max_grad_norm": 0.8, "loss": 0.8575801849365234, "grad_norm": 1.579990267753601, "learning_rate": 4.789103690685413e-05} +{"ts": "2025-12-26T20:45:12", "event": "train_log", "step": 548, "epoch": 0.23122362869198312, "progress_pct": 3.85, "epoch_pct": 3.85, "eta": "64:59:58", "max_grad_norm": 0.8, "loss": 0.828412652015686, "grad_norm": 1.9242485761642456, "learning_rate": 4.806678383128295e-05} +{"ts": "2025-12-26T20:45:32", "event": "train_log", "step": 550, "epoch": 0.2320675105485232, "progress_pct": 3.87, "epoch_pct": 3.87, "eta": "64:53:32", "max_grad_norm": 0.8, "loss": 0.8183464407920837, "grad_norm": 1.812137246131897, "learning_rate": 4.824253075571178e-05} +{"ts": "2025-12-26T20:45:52", "event": "train_log", "step": 552, "epoch": 0.23291139240506328, "progress_pct": 3.88, "epoch_pct": 3.88, "eta": "64:47:24", "max_grad_norm": 0.8, "loss": 0.7822491526603699, "grad_norm": 1.804733395576477, "learning_rate": 4.84182776801406e-05} +{"ts": "2025-12-26T20:46:11", "event": "train_log", "step": 554, "epoch": 0.23375527426160336, "progress_pct": 3.9, "epoch_pct": 3.9, "eta": "64:40:36", "max_grad_norm": 0.8, "loss": 0.9050943851470947, "grad_norm": 2.052257537841797, "learning_rate": 4.859402460456942e-05} +{"ts": "2025-12-26T20:46:32", "event": "train_log", "step": 556, "epoch": 0.23459915611814347, "progress_pct": 3.91, "epoch_pct": 3.91, "eta": "64:34:43", "max_grad_norm": 0.8, "loss": 0.8846852779388428, "grad_norm": 1.9803621768951416, "learning_rate": 4.876977152899824e-05} +{"ts": "2025-12-26T20:46:53", "event": "train_log", "step": 558, "epoch": 0.23544303797468355, "progress_pct": 3.92, "epoch_pct": 3.92, "eta": "64:28:32", "max_grad_norm": 0.8, "loss": 0.8649531602859497, "grad_norm": 1.820125937461853, "learning_rate": 4.894551845342707e-05} +{"ts": "2025-12-26T20:47:12", "event": "train_log", "step": 560, "epoch": 0.23628691983122363, "progress_pct": 3.94, "epoch_pct": 3.94, "eta": "64:22:05", "max_grad_norm": 0.8, "loss": 0.9307748079299927, "grad_norm": 2.0963921546936035, "learning_rate": 4.912126537785589e-05} +{"ts": "2025-12-26T20:47:30", "event": "train_log", "step": 562, "epoch": 0.2371308016877637, "progress_pct": 3.95, "epoch_pct": 3.95, "eta": "64:15:01", "max_grad_norm": 0.8, "loss": 0.9092473387718201, "grad_norm": 2.079697847366333, "learning_rate": 4.929701230228471e-05} +{"ts": "2025-12-26T20:47:49", "event": "train_log", "step": 564, "epoch": 0.2379746835443038, "progress_pct": 3.97, "epoch_pct": 3.97, "eta": "64:08:27", "max_grad_norm": 0.8, "loss": 0.8976567983627319, "grad_norm": 2.0291287899017334, "learning_rate": 4.947275922671353e-05} +{"ts": "2025-12-26T20:48:09", "event": "train_log", "step": 566, "epoch": 0.23881856540084387, "progress_pct": 3.98, "epoch_pct": 3.98, "eta": "64:02:21", "max_grad_norm": 0.8, "loss": 0.8931006193161011, "grad_norm": 1.9636707305908203, "learning_rate": 4.964850615114236e-05} +{"ts": "2025-12-26T20:48:28", "event": "train_log", "step": 568, "epoch": 0.23966244725738398, "progress_pct": 3.99, "epoch_pct": 3.99, "eta": "63:55:56", "max_grad_norm": 0.8, "loss": 0.829562246799469, "grad_norm": 1.922049880027771, "learning_rate": 4.982425307557118e-05} +{"ts": "2025-12-26T20:48:47", "event": "train_log", "step": 570, "epoch": 0.24050632911392406, "progress_pct": 4.01, "epoch_pct": 4.01, "eta": "63:49:19", "max_grad_norm": 0.8, "loss": 0.8568030595779419, "grad_norm": 2.150334596633911, "learning_rate": 5e-05} +{"ts": "2025-12-26T20:49:08", "event": "train_log", "step": 572, "epoch": 0.24135021097046414, "progress_pct": 4.02, "epoch_pct": 4.02, "eta": "63:43:50", "max_grad_norm": 0.8, "loss": 0.8623508810997009, "grad_norm": 2.024437427520752, "learning_rate": 5.017574692442882e-05} +{"ts": "2025-12-26T20:49:29", "event": "train_log", "step": 574, "epoch": 0.24219409282700421, "progress_pct": 4.04, "epoch_pct": 4.04, "eta": "63:38:02", "max_grad_norm": 0.8, "loss": 0.7853795886039734, "grad_norm": 1.8312673568725586, "learning_rate": 5.035149384885765e-05} +{"ts": "2025-12-26T20:49:49", "event": "train_log", "step": 576, "epoch": 0.2430379746835443, "progress_pct": 4.05, "epoch_pct": 4.05, "eta": "63:32:19", "max_grad_norm": 0.8, "loss": 0.9727587103843689, "grad_norm": 1.9271961450576782, "learning_rate": 5.0527240773286467e-05} +{"ts": "2025-12-26T20:50:09", "event": "train_log", "step": 578, "epoch": 0.2438818565400844, "progress_pct": 4.06, "epoch_pct": 4.06, "eta": "63:26:30", "max_grad_norm": 0.8, "loss": 0.8859632015228271, "grad_norm": 1.931249976158142, "learning_rate": 5.0702987697715286e-05} +{"ts": "2025-12-26T20:50:29", "event": "train_log", "step": 580, "epoch": 0.24472573839662448, "progress_pct": 4.08, "epoch_pct": 4.08, "eta": "63:20:46", "max_grad_norm": 0.8, "loss": 0.8959492444992065, "grad_norm": 1.8195210695266724, "learning_rate": 5.087873462214412e-05} +{"ts": "2025-12-26T20:50:49", "event": "train_log", "step": 582, "epoch": 0.24556962025316456, "progress_pct": 4.09, "epoch_pct": 4.09, "eta": "63:14:44", "max_grad_norm": 0.8, "loss": 0.8146185874938965, "grad_norm": 2.0018749237060547, "learning_rate": 5.105448154657294e-05} +{"ts": "2025-12-26T20:51:08", "event": "train_log", "step": 584, "epoch": 0.24641350210970464, "progress_pct": 4.11, "epoch_pct": 4.11, "eta": "63:08:36", "max_grad_norm": 0.8, "loss": 0.8545317053794861, "grad_norm": 2.09798526763916, "learning_rate": 5.1230228471001764e-05} +{"ts": "2025-12-26T20:51:28", "event": "train_log", "step": 586, "epoch": 0.24725738396624472, "progress_pct": 4.12, "epoch_pct": 4.12, "eta": "63:02:56", "max_grad_norm": 0.8, "loss": 0.8650105595588684, "grad_norm": 1.8063944578170776, "learning_rate": 5.140597539543058e-05} +{"ts": "2025-12-26T20:51:48", "event": "train_log", "step": 588, "epoch": 0.2481012658227848, "progress_pct": 4.14, "epoch_pct": 4.14, "eta": "62:57:05", "max_grad_norm": 0.8, "loss": 0.8395693302154541, "grad_norm": 1.8535740375518799, "learning_rate": 5.15817223198594e-05} +{"ts": "2025-12-26T20:52:06", "event": "train_log", "step": 590, "epoch": 0.2489451476793249, "progress_pct": 4.15, "epoch_pct": 4.15, "eta": "62:50:55", "max_grad_norm": 0.8, "loss": 0.8267397284507751, "grad_norm": 2.1443960666656494, "learning_rate": 5.175746924428823e-05} +{"ts": "2025-12-26T20:52:27", "event": "train_log", "step": 592, "epoch": 0.249789029535865, "progress_pct": 4.16, "epoch_pct": 4.16, "eta": "62:45:28", "max_grad_norm": 0.8, "loss": 0.8500015139579773, "grad_norm": 1.9637391567230225, "learning_rate": 5.193321616871705e-05} +{"ts": "2025-12-26T20:52:48", "event": "train_log", "step": 594, "epoch": 0.25063291139240507, "progress_pct": 4.18, "epoch_pct": 4.18, "eta": "62:40:28", "max_grad_norm": 0.8, "loss": 0.887481153011322, "grad_norm": 1.9457582235336304, "learning_rate": 5.2108963093145866e-05} +{"ts": "2025-12-26T20:53:09", "event": "train_log", "step": 596, "epoch": 0.2514767932489452, "progress_pct": 4.19, "epoch_pct": 4.19, "eta": "62:35:05", "max_grad_norm": 0.8, "loss": 0.8444154858589172, "grad_norm": 1.7458715438842773, "learning_rate": 5.228471001757469e-05} +{"ts": "2025-12-26T20:53:29", "event": "train_log", "step": 598, "epoch": 0.2523206751054852, "progress_pct": 4.21, "epoch_pct": 4.21, "eta": "62:29:42", "max_grad_norm": 0.8, "loss": 0.8301781415939331, "grad_norm": 1.8341439962387085, "learning_rate": 5.2460456942003525e-05} +{"ts": "2025-12-26T20:53:49", "event": "train_log", "step": 600, "epoch": 0.25316455696202533, "progress_pct": 4.22, "epoch_pct": 4.22, "eta": "62:24:01", "max_grad_norm": 0.8, "loss": 0.8921551704406738, "grad_norm": 2.127747058868408, "learning_rate": 5.2636203866432344e-05} +{"ts": "2025-12-26T21:07:55", "event": "train_log", "step": 600, "epoch": 0.25316455696202533, "progress_pct": 4.22, "epoch_pct": 4.22, "eta": "67:44:05", "max_grad_norm": 0.8, "eval_loss": 0.8903881311416626, "eval_runtime": 845.9969, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491} +{"ts": "2025-12-26T21:08:16", "event": "train_log", "step": 602, "epoch": 0.2540084388185654, "progress_pct": 4.23, "epoch_pct": 4.23, "eta": "67:38:06", "max_grad_norm": 0.8, "loss": 0.8678019642829895, "grad_norm": 2.421459674835205, "learning_rate": 5.281195079086116e-05} +{"ts": "2025-12-26T21:08:38", "event": "train_log", "step": 604, "epoch": 0.2548523206751055, "progress_pct": 4.25, "epoch_pct": 4.25, "eta": "67:32:24", "max_grad_norm": 0.8, "loss": 0.8564275503158569, "grad_norm": 1.7736057043075562, "learning_rate": 5.298769771528999e-05} +{"ts": "2025-12-26T21:08:57", "event": "train_log", "step": 606, "epoch": 0.25569620253164554, "progress_pct": 4.26, "epoch_pct": 4.26, "eta": "67:25:27", "max_grad_norm": 0.8, "loss": 0.8529049158096313, "grad_norm": 2.28430438041687, "learning_rate": 5.316344463971881e-05} +{"ts": "2025-12-26T21:09:18", "event": "train_log", "step": 608, "epoch": 0.25654008438818565, "progress_pct": 4.28, "epoch_pct": 4.28, "eta": "67:19:22", "max_grad_norm": 0.8, "loss": 0.8672881126403809, "grad_norm": 1.8892366886138916, "learning_rate": 5.333919156414763e-05} +{"ts": "2025-12-26T21:09:37", "event": "train_log", "step": 610, "epoch": 0.25738396624472576, "progress_pct": 4.29, "epoch_pct": 4.29, "eta": "67:12:36", "max_grad_norm": 0.8, "loss": 0.9094445109367371, "grad_norm": 1.9059702157974243, "learning_rate": 5.3514938488576446e-05} +{"ts": "2025-12-26T21:09:56", "event": "train_log", "step": 612, "epoch": 0.2582278481012658, "progress_pct": 4.3, "epoch_pct": 4.3, "eta": "67:05:50", "max_grad_norm": 0.8, "loss": 0.8361946940422058, "grad_norm": 2.0657339096069336, "learning_rate": 5.369068541300527e-05} +{"ts": "2025-12-26T21:10:16", "event": "train_log", "step": 614, "epoch": 0.2590717299578059, "progress_pct": 4.32, "epoch_pct": 4.32, "eta": "66:59:33", "max_grad_norm": 0.8, "loss": 0.8319925665855408, "grad_norm": 1.8987553119659424, "learning_rate": 5.3866432337434105e-05} +{"ts": "2025-12-26T21:10:34", "event": "train_log", "step": 616, "epoch": 0.25991561181434597, "progress_pct": 4.33, "epoch_pct": 4.33, "eta": "66:52:30", "max_grad_norm": 0.8, "loss": 0.9818069934844971, "grad_norm": 2.1176226139068604, "learning_rate": 5.4042179261862924e-05} +{"ts": "2025-12-26T21:10:53", "event": "train_log", "step": 618, "epoch": 0.2607594936708861, "progress_pct": 4.35, "epoch_pct": 4.35, "eta": "66:45:53", "max_grad_norm": 0.8, "loss": 0.8675919771194458, "grad_norm": 2.142096519470215, "learning_rate": 5.421792618629174e-05} +{"ts": "2025-12-26T21:11:11", "event": "train_log", "step": 620, "epoch": 0.2616033755274262, "progress_pct": 4.36, "epoch_pct": 4.36, "eta": "66:39:06", "max_grad_norm": 0.8, "loss": 0.8845479488372803, "grad_norm": 1.9527089595794678, "learning_rate": 5.439367311072057e-05} +{"ts": "2025-12-26T21:11:31", "event": "train_log", "step": 622, "epoch": 0.26244725738396624, "progress_pct": 4.37, "epoch_pct": 4.37, "eta": "66:32:50", "max_grad_norm": 0.8, "loss": 0.809393048286438, "grad_norm": 1.7071453332901, "learning_rate": 5.456942003514939e-05} +{"ts": "2025-12-26T21:11:51", "event": "train_log", "step": 624, "epoch": 0.26329113924050634, "progress_pct": 4.39, "epoch_pct": 4.39, "eta": "66:26:54", "max_grad_norm": 0.8, "loss": 0.8262377977371216, "grad_norm": 1.9133527278900146, "learning_rate": 5.474516695957821e-05} +{"ts": "2025-12-26T21:12:10", "event": "train_log", "step": 626, "epoch": 0.2641350210970464, "progress_pct": 4.4, "epoch_pct": 4.4, "eta": "66:20:22", "max_grad_norm": 0.8, "loss": 0.9006736278533936, "grad_norm": 2.0217554569244385, "learning_rate": 5.492091388400703e-05} +{"ts": "2025-12-26T21:12:30", "event": "train_log", "step": 628, "epoch": 0.2649789029535865, "progress_pct": 4.42, "epoch_pct": 4.42, "eta": "66:14:08", "max_grad_norm": 0.8, "loss": 0.8243603110313416, "grad_norm": 1.773273229598999, "learning_rate": 5.509666080843585e-05} +{"ts": "2025-12-26T21:12:50", "event": "train_log", "step": 630, "epoch": 0.26582278481012656, "progress_pct": 4.43, "epoch_pct": 4.43, "eta": "66:08:20", "max_grad_norm": 0.8, "loss": 0.8112778663635254, "grad_norm": 1.6580880880355835, "learning_rate": 5.527240773286467e-05} +{"ts": "2025-12-26T21:13:11", "event": "train_log", "step": 632, "epoch": 0.26666666666666666, "progress_pct": 4.44, "epoch_pct": 4.44, "eta": "66:02:37", "max_grad_norm": 0.8, "loss": 0.8390820622444153, "grad_norm": 1.8342082500457764, "learning_rate": 5.5448154657293504e-05} +{"ts": "2025-12-26T21:13:31", "event": "train_log", "step": 634, "epoch": 0.26751054852320677, "progress_pct": 4.46, "epoch_pct": 4.46, "eta": "65:56:47", "max_grad_norm": 0.8, "loss": 0.8264521360397339, "grad_norm": 1.863695502281189, "learning_rate": 5.5623901581722323e-05} +{"ts": "2025-12-26T21:13:48", "event": "train_log", "step": 636, "epoch": 0.2683544303797468, "progress_pct": 4.47, "epoch_pct": 4.47, "eta": "65:49:50", "max_grad_norm": 0.8, "loss": 0.9512701630592346, "grad_norm": 1.9462928771972656, "learning_rate": 5.579964850615115e-05} +{"ts": "2025-12-26T21:14:07", "event": "train_log", "step": 638, "epoch": 0.26919831223628693, "progress_pct": 4.49, "epoch_pct": 4.49, "eta": "65:43:35", "max_grad_norm": 0.8, "loss": 0.9422703981399536, "grad_norm": 1.7776058912277222, "learning_rate": 5.597539543057997e-05} +{"ts": "2025-12-26T21:14:27", "event": "train_log", "step": 640, "epoch": 0.270042194092827, "progress_pct": 4.5, "epoch_pct": 4.5, "eta": "65:37:40", "max_grad_norm": 0.8, "loss": 0.7991042137145996, "grad_norm": 2.9457077980041504, "learning_rate": 5.615114235500879e-05} +{"ts": "2025-12-26T21:14:48", "event": "train_log", "step": 642, "epoch": 0.2708860759493671, "progress_pct": 4.51, "epoch_pct": 4.51, "eta": "65:32:21", "max_grad_norm": 0.8, "loss": 0.8188099265098572, "grad_norm": 1.445265531539917, "learning_rate": 5.6326889279437614e-05} +{"ts": "2025-12-26T21:15:07", "event": "train_log", "step": 644, "epoch": 0.2717299578059072, "progress_pct": 4.53, "epoch_pct": 4.53, "eta": "65:26:12", "max_grad_norm": 0.8, "loss": 0.9799772500991821, "grad_norm": 2.063850164413452, "learning_rate": 5.650263620386643e-05} +{"ts": "2025-12-26T21:15:27", "event": "train_log", "step": 646, "epoch": 0.27257383966244725, "progress_pct": 4.54, "epoch_pct": 4.54, "eta": "65:20:20", "max_grad_norm": 0.8, "loss": 0.8462742567062378, "grad_norm": 2.0488009452819824, "learning_rate": 5.667838312829525e-05} +{"ts": "2025-12-26T21:15:46", "event": "train_log", "step": 648, "epoch": 0.27341772151898736, "progress_pct": 4.56, "epoch_pct": 4.56, "eta": "65:14:19", "max_grad_norm": 0.8, "loss": 0.8226412534713745, "grad_norm": 1.8747851848602295, "learning_rate": 5.685413005272408e-05} +{"ts": "2025-12-26T21:16:07", "event": "train_log", "step": 650, "epoch": 0.2742616033755274, "progress_pct": 4.57, "epoch_pct": 4.57, "eta": "65:08:52", "max_grad_norm": 0.8, "loss": 0.9146338105201721, "grad_norm": 1.849074125289917, "learning_rate": 5.702987697715291e-05} +{"ts": "2025-12-26T21:16:26", "event": "train_log", "step": 652, "epoch": 0.2751054852320675, "progress_pct": 4.59, "epoch_pct": 4.59, "eta": "65:03:00", "max_grad_norm": 0.8, "loss": 0.7574424147605896, "grad_norm": 1.7738500833511353, "learning_rate": 5.720562390158173e-05} +{"ts": "2025-12-26T21:16:46", "event": "train_log", "step": 654, "epoch": 0.2759493670886076, "progress_pct": 4.6, "epoch_pct": 4.6, "eta": "64:57:30", "max_grad_norm": 0.8, "loss": 0.8930003046989441, "grad_norm": 1.911102294921875, "learning_rate": 5.738137082601055e-05} +{"ts": "2025-12-26T21:17:08", "event": "train_log", "step": 656, "epoch": 0.2767932489451477, "progress_pct": 4.61, "epoch_pct": 4.61, "eta": "64:52:37", "max_grad_norm": 0.8, "loss": 0.7578965425491333, "grad_norm": 1.5716617107391357, "learning_rate": 5.755711775043937e-05} +{"ts": "2025-12-26T21:17:27", "event": "train_log", "step": 658, "epoch": 0.2776371308016878, "progress_pct": 4.63, "epoch_pct": 4.63, "eta": "64:46:41", "max_grad_norm": 0.8, "loss": 0.8149038553237915, "grad_norm": 1.789036512374878, "learning_rate": 5.7732864674868194e-05} +{"ts": "2025-12-26T21:17:47", "event": "train_log", "step": 660, "epoch": 0.27848101265822783, "progress_pct": 4.64, "epoch_pct": 4.64, "eta": "64:41:09", "max_grad_norm": 0.8, "loss": 0.8265765905380249, "grad_norm": 1.68622624874115, "learning_rate": 5.790861159929701e-05} +{"ts": "2025-12-26T21:18:05", "event": "train_log", "step": 662, "epoch": 0.27932489451476794, "progress_pct": 4.66, "epoch_pct": 4.66, "eta": "64:35:13", "max_grad_norm": 0.8, "loss": 0.9651970267295837, "grad_norm": 2.078423261642456, "learning_rate": 5.808435852372583e-05} +{"ts": "2025-12-26T21:18:25", "event": "train_log", "step": 664, "epoch": 0.280168776371308, "progress_pct": 4.67, "epoch_pct": 4.67, "eta": "64:29:44", "max_grad_norm": 0.8, "loss": 0.8295148015022278, "grad_norm": 1.7878645658493042, "learning_rate": 5.826010544815466e-05} +{"ts": "2025-12-26T21:18:45", "event": "train_log", "step": 666, "epoch": 0.2810126582278481, "progress_pct": 4.68, "epoch_pct": 4.68, "eta": "64:24:18", "max_grad_norm": 0.8, "loss": 0.7778491377830505, "grad_norm": 1.970838189125061, "learning_rate": 5.843585237258348e-05} +{"ts": "2025-12-26T21:19:04", "event": "train_log", "step": 668, "epoch": 0.2818565400843882, "progress_pct": 4.7, "epoch_pct": 4.7, "eta": "64:18:26", "max_grad_norm": 0.8, "loss": 0.9818071722984314, "grad_norm": 1.943596363067627, "learning_rate": 5.861159929701231e-05} +{"ts": "2025-12-26T21:19:22", "event": "train_log", "step": 670, "epoch": 0.28270042194092826, "progress_pct": 4.71, "epoch_pct": 4.71, "eta": "64:12:33", "max_grad_norm": 0.8, "loss": 0.9297797083854675, "grad_norm": 1.8793812990188599, "learning_rate": 5.878734622144113e-05} +{"ts": "2025-12-26T21:19:41", "event": "train_log", "step": 672, "epoch": 0.28354430379746837, "progress_pct": 4.73, "epoch_pct": 4.73, "eta": "64:06:46", "max_grad_norm": 0.8, "loss": 0.8748109936714172, "grad_norm": 1.8813483715057373, "learning_rate": 5.8963093145869955e-05} +{"ts": "2025-12-26T21:19:59", "event": "train_log", "step": 674, "epoch": 0.2843881856540084, "progress_pct": 4.74, "epoch_pct": 4.74, "eta": "64:00:55", "max_grad_norm": 0.8, "loss": 0.8505244851112366, "grad_norm": 1.7658562660217285, "learning_rate": 5.9138840070298774e-05} +{"ts": "2025-12-26T21:20:19", "event": "train_log", "step": 676, "epoch": 0.2852320675105485, "progress_pct": 4.75, "epoch_pct": 4.75, "eta": "63:55:35", "max_grad_norm": 0.8, "loss": 0.8476597666740417, "grad_norm": 1.6767617464065552, "learning_rate": 5.931458699472759e-05} +{"ts": "2025-12-26T21:20:37", "event": "train_log", "step": 678, "epoch": 0.28607594936708863, "progress_pct": 4.77, "epoch_pct": 4.77, "eta": "63:49:52", "max_grad_norm": 0.8, "loss": 0.8775192499160767, "grad_norm": 2.703104257583618, "learning_rate": 5.949033391915641e-05} +{"ts": "2025-12-26T21:20:57", "event": "train_log", "step": 680, "epoch": 0.2869198312236287, "progress_pct": 4.78, "epoch_pct": 4.78, "eta": "63:44:25", "max_grad_norm": 0.8, "loss": 0.855262279510498, "grad_norm": 1.9959728717803955, "learning_rate": 5.966608084358524e-05} +{"ts": "2025-12-26T21:21:16", "event": "train_log", "step": 682, "epoch": 0.2877637130801688, "progress_pct": 4.8, "epoch_pct": 4.8, "eta": "63:39:12", "max_grad_norm": 0.8, "loss": 0.7574936151504517, "grad_norm": 1.9093716144561768, "learning_rate": 5.984182776801406e-05} +{"ts": "2025-12-26T21:21:36", "event": "train_log", "step": 684, "epoch": 0.28860759493670884, "progress_pct": 4.81, "epoch_pct": 4.81, "eta": "63:33:54", "max_grad_norm": 0.8, "loss": 0.8630690574645996, "grad_norm": 1.9829599857330322, "learning_rate": 6.001757469244289e-05} +{"ts": "2025-12-26T21:21:55", "event": "train_log", "step": 686, "epoch": 0.28945147679324895, "progress_pct": 4.82, "epoch_pct": 4.82, "eta": "63:28:34", "max_grad_norm": 0.8, "loss": 0.8513249158859253, "grad_norm": 1.8777490854263306, "learning_rate": 6.019332161687171e-05} +{"ts": "2025-12-26T21:22:14", "event": "train_log", "step": 688, "epoch": 0.290295358649789, "progress_pct": 4.84, "epoch_pct": 4.84, "eta": "63:23:08", "max_grad_norm": 0.8, "loss": 0.9097008109092712, "grad_norm": 1.9453173875808716, "learning_rate": 6.0369068541300535e-05} +{"ts": "2025-12-26T21:22:34", "event": "train_log", "step": 690, "epoch": 0.2911392405063291, "progress_pct": 4.85, "epoch_pct": 4.85, "eta": "63:18:09", "max_grad_norm": 0.8, "loss": 0.8291722536087036, "grad_norm": 1.8527908325195312, "learning_rate": 6.0544815465729354e-05} +{"ts": "2025-12-26T21:22:53", "event": "train_log", "step": 692, "epoch": 0.2919831223628692, "progress_pct": 4.87, "epoch_pct": 4.87, "eta": "63:12:41", "max_grad_norm": 0.8, "loss": 0.880009651184082, "grad_norm": 1.9255812168121338, "learning_rate": 6.0720562390158174e-05} +{"ts": "2025-12-26T21:23:13", "event": "train_log", "step": 694, "epoch": 0.29282700421940927, "progress_pct": 4.88, "epoch_pct": 4.88, "eta": "63:07:41", "max_grad_norm": 0.8, "loss": 0.8791794180870056, "grad_norm": 1.6637977361679077, "learning_rate": 6.0896309314587e-05} +{"ts": "2025-12-26T21:23:32", "event": "train_log", "step": 696, "epoch": 0.2936708860759494, "progress_pct": 4.89, "epoch_pct": 4.89, "eta": "63:02:34", "max_grad_norm": 0.8, "loss": 0.8662407398223877, "grad_norm": 1.825940728187561, "learning_rate": 6.107205623901582e-05} +{"ts": "2025-12-26T21:23:51", "event": "train_log", "step": 698, "epoch": 0.29451476793248943, "progress_pct": 4.91, "epoch_pct": 4.91, "eta": "62:57:09", "max_grad_norm": 0.8, "loss": 0.8984515070915222, "grad_norm": 1.9348198175430298, "learning_rate": 6.124780316344464e-05} +{"ts": "2025-12-26T21:24:11", "event": "train_log", "step": 700, "epoch": 0.29535864978902954, "progress_pct": 4.92, "epoch_pct": 4.92, "eta": "62:52:10", "max_grad_norm": 0.8, "loss": 0.827385663986206, "grad_norm": 1.659345030784607, "learning_rate": 6.142355008787346e-05} +{"ts": "2025-12-26T21:38:29", "event": "train_log", "step": 700, "epoch": 0.29535864978902954, "progress_pct": 4.92, "epoch_pct": 4.92, "eta": "67:28:25", "max_grad_norm": 0.8, "eval_loss": 0.8730722069740295, "eval_runtime": 858.184, "eval_samples_per_second": 2.455, "eval_steps_per_second": 2.455} +{"ts": "2025-12-26T21:38:49", "event": "train_log", "step": 702, "epoch": 0.29620253164556964, "progress_pct": 4.94, "epoch_pct": 4.94, "eta": "67:22:51", "max_grad_norm": 0.8, "loss": 0.9337764382362366, "grad_norm": 1.6531789302825928, "learning_rate": 6.159929701230229e-05} +{"ts": "2025-12-26T21:39:09", "event": "train_log", "step": 704, "epoch": 0.2970464135021097, "progress_pct": 4.95, "epoch_pct": 4.95, "eta": "67:16:57", "max_grad_norm": 0.8, "loss": 0.8250943422317505, "grad_norm": 1.8269121646881104, "learning_rate": 6.177504393673111e-05} +{"ts": "2025-12-26T21:39:28", "event": "train_log", "step": 706, "epoch": 0.2978902953586498, "progress_pct": 4.96, "epoch_pct": 4.96, "eta": "67:11:11", "max_grad_norm": 0.8, "loss": 0.8657428026199341, "grad_norm": 1.692808747291565, "learning_rate": 6.195079086115994e-05} +{"ts": "2025-12-26T21:39:47", "event": "train_log", "step": 708, "epoch": 0.29873417721518986, "progress_pct": 4.98, "epoch_pct": 4.98, "eta": "67:05:05", "max_grad_norm": 0.8, "loss": 0.8889590501785278, "grad_norm": 1.6736913919448853, "learning_rate": 6.212653778558876e-05} +{"ts": "2025-12-26T21:40:06", "event": "train_log", "step": 710, "epoch": 0.29957805907172996, "progress_pct": 4.99, "epoch_pct": 4.99, "eta": "66:59:19", "max_grad_norm": 0.8, "loss": 0.7822914123535156, "grad_norm": 1.6841140985488892, "learning_rate": 6.230228471001758e-05} +{"ts": "2025-12-26T21:40:27", "event": "train_log", "step": 712, "epoch": 0.30042194092827, "progress_pct": 5.01, "epoch_pct": 5.01, "eta": "66:53:55", "max_grad_norm": 0.8, "loss": 0.8747053742408752, "grad_norm": 1.6644599437713623, "learning_rate": 6.24780316344464e-05} +{"ts": "2025-12-26T21:40:46", "event": "train_log", "step": 714, "epoch": 0.3012658227848101, "progress_pct": 5.02, "epoch_pct": 5.02, "eta": "66:48:10", "max_grad_norm": 0.8, "loss": 0.8976446390151978, "grad_norm": 1.8187819719314575, "learning_rate": 6.265377855887522e-05} +{"ts": "2025-12-26T21:41:05", "event": "train_log", "step": 716, "epoch": 0.30210970464135023, "progress_pct": 5.04, "epoch_pct": 5.04, "eta": "66:42:23", "max_grad_norm": 0.8, "loss": 0.9401160478591919, "grad_norm": 1.7845178842544556, "learning_rate": 6.282952548330404e-05} +{"ts": "2025-12-26T21:41:25", "event": "train_log", "step": 718, "epoch": 0.3029535864978903, "progress_pct": 5.05, "epoch_pct": 5.05, "eta": "66:36:49", "max_grad_norm": 0.8, "loss": 0.8754280209541321, "grad_norm": 1.559773564338684, "learning_rate": 6.300527240773286e-05} +{"ts": "2025-12-26T21:41:47", "event": "train_log", "step": 720, "epoch": 0.3037974683544304, "progress_pct": 5.06, "epoch_pct": 5.06, "eta": "66:31:59", "max_grad_norm": 0.8, "loss": 0.8278581500053406, "grad_norm": 1.5919631719589233, "learning_rate": 6.318101933216169e-05} +{"ts": "2025-12-26T21:42:07", "event": "train_log", "step": 722, "epoch": 0.30464135021097044, "progress_pct": 5.08, "epoch_pct": 5.08, "eta": "66:26:32", "max_grad_norm": 0.8, "loss": 0.8868640065193176, "grad_norm": 1.8551076650619507, "learning_rate": 6.335676625659052e-05} +{"ts": "2025-12-26T21:42:28", "event": "train_log", "step": 724, "epoch": 0.30548523206751055, "progress_pct": 5.09, "epoch_pct": 5.09, "eta": "66:21:33", "max_grad_norm": 0.8, "loss": 0.8631605505943298, "grad_norm": 1.6907769441604614, "learning_rate": 6.353251318101934e-05} +{"ts": "2025-12-26T21:42:46", "event": "train_log", "step": 726, "epoch": 0.30632911392405066, "progress_pct": 5.11, "epoch_pct": 5.11, "eta": "66:15:31", "max_grad_norm": 0.8, "loss": 0.9142873883247375, "grad_norm": 1.820867657661438, "learning_rate": 6.370826010544816e-05} +{"ts": "2025-12-26T21:43:06", "event": "train_log", "step": 728, "epoch": 0.3071729957805907, "progress_pct": 5.12, "epoch_pct": 5.12, "eta": "66:10:06", "max_grad_norm": 0.8, "loss": 0.8258634805679321, "grad_norm": 1.685154676437378, "learning_rate": 6.388400702987698e-05} +{"ts": "2025-12-26T21:43:25", "event": "train_log", "step": 730, "epoch": 0.3080168776371308, "progress_pct": 5.13, "epoch_pct": 5.13, "eta": "66:04:28", "max_grad_norm": 0.8, "loss": 0.9545516967773438, "grad_norm": 1.9294627904891968, "learning_rate": 6.40597539543058e-05} +{"ts": "2025-12-26T21:43:43", "event": "train_log", "step": 732, "epoch": 0.30886075949367087, "progress_pct": 5.15, "epoch_pct": 5.15, "eta": "65:58:51", "max_grad_norm": 0.8, "loss": 0.8370757699012756, "grad_norm": 1.6075409650802612, "learning_rate": 6.423550087873462e-05} +{"ts": "2025-12-26T21:44:03", "event": "train_log", "step": 734, "epoch": 0.309704641350211, "progress_pct": 5.16, "epoch_pct": 5.16, "eta": "65:53:20", "max_grad_norm": 0.8, "loss": 0.8356084823608398, "grad_norm": 1.635750651359558, "learning_rate": 6.441124780316345e-05} +{"ts": "2025-12-26T21:44:22", "event": "train_log", "step": 736, "epoch": 0.3105485232067511, "progress_pct": 5.18, "epoch_pct": 5.18, "eta": "65:47:59", "max_grad_norm": 0.8, "loss": 0.7579531669616699, "grad_norm": 1.6376131772994995, "learning_rate": 6.458699472759227e-05} +{"ts": "2025-12-26T21:44:41", "event": "train_log", "step": 738, "epoch": 0.31139240506329113, "progress_pct": 5.19, "epoch_pct": 5.19, "eta": "65:42:27", "max_grad_norm": 0.8, "loss": 0.8436318039894104, "grad_norm": 1.7135766744613647, "learning_rate": 6.47627416520211e-05} +{"ts": "2025-12-26T21:45:00", "event": "train_log", "step": 740, "epoch": 0.31223628691983124, "progress_pct": 5.2, "epoch_pct": 5.2, "eta": "65:36:53", "max_grad_norm": 0.8, "loss": 0.7998805046081543, "grad_norm": 1.7095093727111816, "learning_rate": 6.493848857644992e-05} +{"ts": "2025-12-26T21:45:19", "event": "train_log", "step": 742, "epoch": 0.3130801687763713, "progress_pct": 5.22, "epoch_pct": 5.22, "eta": "65:31:24", "max_grad_norm": 0.8, "loss": 0.915776789188385, "grad_norm": 1.782615303993225, "learning_rate": 6.511423550087874e-05} +{"ts": "2025-12-26T21:45:38", "event": "train_log", "step": 744, "epoch": 0.3139240506329114, "progress_pct": 5.23, "epoch_pct": 5.23, "eta": "65:26:05", "max_grad_norm": 0.8, "loss": 0.8300962448120117, "grad_norm": 1.8461172580718994, "learning_rate": 6.528998242530756e-05} +{"ts": "2025-12-26T21:45:57", "event": "train_log", "step": 746, "epoch": 0.31476793248945145, "progress_pct": 5.25, "epoch_pct": 5.25, "eta": "65:20:43", "max_grad_norm": 0.8, "loss": 0.8239848017692566, "grad_norm": 1.5659871101379395, "learning_rate": 6.546572934973638e-05} +{"ts": "2025-12-26T21:46:15", "event": "train_log", "step": 748, "epoch": 0.31561181434599156, "progress_pct": 5.26, "epoch_pct": 5.26, "eta": "65:15:09", "max_grad_norm": 0.8, "loss": 0.8236988186836243, "grad_norm": 1.9997349977493286, "learning_rate": 6.56414762741652e-05} +{"ts": "2025-12-26T21:46:34", "event": "train_log", "step": 750, "epoch": 0.31645569620253167, "progress_pct": 5.27, "epoch_pct": 5.27, "eta": "65:09:36", "max_grad_norm": 0.8, "loss": 0.8516603112220764, "grad_norm": 1.9811526536941528, "learning_rate": 6.581722319859403e-05} +{"ts": "2025-12-26T21:46:52", "event": "train_log", "step": 752, "epoch": 0.3172995780590717, "progress_pct": 5.29, "epoch_pct": 5.29, "eta": "65:04:17", "max_grad_norm": 0.8, "loss": 0.9037567973136902, "grad_norm": 1.9877923727035522, "learning_rate": 6.599297012302285e-05} +{"ts": "2025-12-26T21:47:11", "event": "train_log", "step": 754, "epoch": 0.3181434599156118, "progress_pct": 5.3, "epoch_pct": 5.3, "eta": "64:58:52", "max_grad_norm": 0.8, "loss": 0.8350864052772522, "grad_norm": 1.6729352474212646, "learning_rate": 6.616871704745168e-05} +{"ts": "2025-12-26T21:47:30", "event": "train_log", "step": 756, "epoch": 0.3189873417721519, "progress_pct": 5.32, "epoch_pct": 5.32, "eta": "64:53:40", "max_grad_norm": 0.8, "loss": 0.8246616125106812, "grad_norm": 1.9055802822113037, "learning_rate": 6.63444639718805e-05} +{"ts": "2025-12-26T21:47:50", "event": "train_log", "step": 758, "epoch": 0.319831223628692, "progress_pct": 5.33, "epoch_pct": 5.33, "eta": "64:48:37", "max_grad_norm": 0.8, "loss": 0.8014416098594666, "grad_norm": 1.597999930381775, "learning_rate": 6.652021089630932e-05} +{"ts": "2025-12-26T21:48:10", "event": "train_log", "step": 760, "epoch": 0.3206751054852321, "progress_pct": 5.34, "epoch_pct": 5.34, "eta": "64:43:47", "max_grad_norm": 0.8, "loss": 0.9199523329734802, "grad_norm": 1.7432531118392944, "learning_rate": 6.669595782073814e-05} +{"ts": "2025-12-26T21:48:29", "event": "train_log", "step": 762, "epoch": 0.32151898734177214, "progress_pct": 5.36, "epoch_pct": 5.36, "eta": "64:38:39", "max_grad_norm": 0.8, "loss": 0.7764829397201538, "grad_norm": 1.820164442062378, "learning_rate": 6.687170474516696e-05} +{"ts": "2025-12-26T21:48:48", "event": "train_log", "step": 764, "epoch": 0.32236286919831225, "progress_pct": 5.37, "epoch_pct": 5.37, "eta": "64:33:33", "max_grad_norm": 0.8, "loss": 0.8072620630264282, "grad_norm": 1.6408652067184448, "learning_rate": 6.704745166959578e-05} +{"ts": "2025-12-26T21:49:07", "event": "train_log", "step": 766, "epoch": 0.3232067510548523, "progress_pct": 5.39, "epoch_pct": 5.39, "eta": "64:28:25", "max_grad_norm": 0.8, "loss": 0.9006885886192322, "grad_norm": 1.8894155025482178, "learning_rate": 6.722319859402461e-05} +{"ts": "2025-12-26T21:49:27", "event": "train_log", "step": 768, "epoch": 0.3240506329113924, "progress_pct": 5.4, "epoch_pct": 5.4, "eta": "64:23:29", "max_grad_norm": 0.8, "loss": 0.7772189378738403, "grad_norm": 1.6903613805770874, "learning_rate": 6.739894551845343e-05} +{"ts": "2025-12-26T21:49:46", "event": "train_log", "step": 770, "epoch": 0.32489451476793246, "progress_pct": 5.41, "epoch_pct": 5.41, "eta": "64:18:21", "max_grad_norm": 0.8, "loss": 0.8825590014457703, "grad_norm": 1.7540696859359741, "learning_rate": 6.757469244288225e-05} +{"ts": "2025-12-26T21:50:05", "event": "train_log", "step": 772, "epoch": 0.32573839662447257, "progress_pct": 5.43, "epoch_pct": 5.43, "eta": "64:13:26", "max_grad_norm": 0.8, "loss": 0.8376453518867493, "grad_norm": 1.603008508682251, "learning_rate": 6.775043936731108e-05} +{"ts": "2025-12-26T21:50:25", "event": "train_log", "step": 774, "epoch": 0.3265822784810127, "progress_pct": 5.44, "epoch_pct": 5.44, "eta": "64:08:38", "max_grad_norm": 0.8, "loss": 0.92608243227005, "grad_norm": 1.5381462574005127, "learning_rate": 6.79261862917399e-05} +{"ts": "2025-12-26T21:50:46", "event": "train_log", "step": 776, "epoch": 0.32742616033755273, "progress_pct": 5.46, "epoch_pct": 5.46, "eta": "64:04:15", "max_grad_norm": 0.8, "loss": 0.6842183470726013, "grad_norm": 1.4815537929534912, "learning_rate": 6.810193321616872e-05} +{"ts": "2025-12-26T21:51:06", "event": "train_log", "step": 778, "epoch": 0.32827004219409284, "progress_pct": 5.47, "epoch_pct": 5.47, "eta": "63:59:27", "max_grad_norm": 0.8, "loss": 0.8868235349655151, "grad_norm": 1.8543411493301392, "learning_rate": 6.827768014059754e-05} +{"ts": "2025-12-26T21:51:25", "event": "train_log", "step": 780, "epoch": 0.3291139240506329, "progress_pct": 5.49, "epoch_pct": 5.49, "eta": "63:54:37", "max_grad_norm": 0.8, "loss": 0.8148112297058105, "grad_norm": 1.8895748853683472, "learning_rate": 6.845342706502637e-05} +{"ts": "2025-12-26T21:51:44", "event": "train_log", "step": 782, "epoch": 0.329957805907173, "progress_pct": 5.5, "epoch_pct": 5.5, "eta": "63:49:41", "max_grad_norm": 0.8, "loss": 0.8760337829589844, "grad_norm": 1.8150591850280762, "learning_rate": 6.862917398945519e-05} +{"ts": "2025-12-26T21:52:04", "event": "train_log", "step": 784, "epoch": 0.3308016877637131, "progress_pct": 5.51, "epoch_pct": 5.51, "eta": "63:45:07", "max_grad_norm": 0.8, "loss": 0.8266322612762451, "grad_norm": 1.6661378145217896, "learning_rate": 6.880492091388401e-05} +{"ts": "2025-12-26T21:52:23", "event": "train_log", "step": 786, "epoch": 0.33164556962025316, "progress_pct": 5.53, "epoch_pct": 5.53, "eta": "63:40:15", "max_grad_norm": 0.8, "loss": 0.8599053025245667, "grad_norm": 2.2849128246307373, "learning_rate": 6.898066783831283e-05} +{"ts": "2025-12-26T21:52:43", "event": "train_log", "step": 788, "epoch": 0.33248945147679326, "progress_pct": 5.54, "epoch_pct": 5.54, "eta": "63:35:27", "max_grad_norm": 0.8, "loss": 0.8312317132949829, "grad_norm": 1.7233171463012695, "learning_rate": 6.915641476274165e-05} +{"ts": "2025-12-26T21:53:02", "event": "train_log", "step": 790, "epoch": 0.3333333333333333, "progress_pct": 5.56, "epoch_pct": 5.56, "eta": "63:30:39", "max_grad_norm": 0.8, "loss": 0.8379700779914856, "grad_norm": 1.7637618780136108, "learning_rate": 6.933216168717048e-05} +{"ts": "2025-12-26T21:53:21", "event": "train_log", "step": 792, "epoch": 0.3341772151898734, "progress_pct": 5.57, "epoch_pct": 5.57, "eta": "63:25:53", "max_grad_norm": 0.8, "loss": 0.8994934558868408, "grad_norm": 1.7780474424362183, "learning_rate": 6.95079086115993e-05} +{"ts": "2025-12-26T21:53:41", "event": "train_log", "step": 794, "epoch": 0.33502109704641353, "progress_pct": 5.58, "epoch_pct": 5.58, "eta": "63:21:22", "max_grad_norm": 0.8, "loss": 0.8021857738494873, "grad_norm": 1.5798883438110352, "learning_rate": 6.968365553602812e-05} +{"ts": "2025-12-26T21:54:00", "event": "train_log", "step": 796, "epoch": 0.3358649789029536, "progress_pct": 5.6, "epoch_pct": 5.6, "eta": "63:16:35", "max_grad_norm": 0.8, "loss": 0.8814419507980347, "grad_norm": 1.7316070795059204, "learning_rate": 6.985940246045695e-05} +{"ts": "2025-12-26T21:54:18", "event": "train_log", "step": 798, "epoch": 0.3367088607594937, "progress_pct": 5.61, "epoch_pct": 5.61, "eta": "63:11:39", "max_grad_norm": 0.8, "loss": 0.8545029163360596, "grad_norm": 1.711315631866455, "learning_rate": 7.003514938488577e-05} +{"ts": "2025-12-26T21:54:38", "event": "train_log", "step": 800, "epoch": 0.33755274261603374, "progress_pct": 5.63, "epoch_pct": 5.63, "eta": "63:07:09", "max_grad_norm": 0.8, "loss": 0.8006189465522766, "grad_norm": 1.5023137331008911, "learning_rate": 7.021089630931459e-05} +{"ts": "2025-12-26T22:09:04", "event": "train_log", "step": 800, "epoch": 0.33755274261603374, "progress_pct": 5.63, "epoch_pct": 5.63, "eta": "67:09:15", "max_grad_norm": 0.8, "eval_loss": 0.8635594248771667, "eval_runtime": 865.9348, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433} +{"ts": "2025-12-26T22:09:24", "event": "train_log", "step": 802, "epoch": 0.33839662447257385, "progress_pct": 5.64, "epoch_pct": 5.64, "eta": "67:04:03", "max_grad_norm": 0.8, "loss": 0.7625874280929565, "grad_norm": 1.8377124071121216, "learning_rate": 7.038664323374341e-05} +{"ts": "2025-12-26T22:09:43", "event": "train_log", "step": 804, "epoch": 0.3392405063291139, "progress_pct": 5.65, "epoch_pct": 5.65, "eta": "66:58:42", "max_grad_norm": 0.8, "loss": 0.8490484356880188, "grad_norm": 1.5361332893371582, "learning_rate": 7.056239015817223e-05} +{"ts": "2025-12-26T22:10:02", "event": "train_log", "step": 806, "epoch": 0.340084388185654, "progress_pct": 5.67, "epoch_pct": 5.67, "eta": "66:53:33", "max_grad_norm": 0.8, "loss": 0.8915753364562988, "grad_norm": 1.8727388381958008, "learning_rate": 7.073813708260105e-05} +{"ts": "2025-12-26T22:10:22", "event": "train_log", "step": 808, "epoch": 0.3409282700421941, "progress_pct": 5.68, "epoch_pct": 5.68, "eta": "66:48:25", "max_grad_norm": 0.8, "loss": 0.8902620077133179, "grad_norm": 1.567700743675232, "learning_rate": 7.091388400702988e-05} +{"ts": "2025-12-26T22:10:43", "event": "train_log", "step": 810, "epoch": 0.34177215189873417, "progress_pct": 5.7, "epoch_pct": 5.7, "eta": "66:43:44", "max_grad_norm": 0.8, "loss": 0.7897103428840637, "grad_norm": 1.5302914381027222, "learning_rate": 7.10896309314587e-05} +{"ts": "2025-12-26T22:11:01", "event": "train_log", "step": 812, "epoch": 0.3426160337552743, "progress_pct": 5.71, "epoch_pct": 5.71, "eta": "66:38:23", "max_grad_norm": 0.8, "loss": 0.8648831248283386, "grad_norm": 1.8819153308868408, "learning_rate": 7.126537785588753e-05} +{"ts": "2025-12-26T22:11:21", "event": "train_log", "step": 814, "epoch": 0.3434599156118143, "progress_pct": 5.72, "epoch_pct": 5.72, "eta": "66:33:28", "max_grad_norm": 0.8, "loss": 0.8449499607086182, "grad_norm": 1.5671379566192627, "learning_rate": 7.144112478031635e-05} +{"ts": "2025-12-26T22:11:40", "event": "train_log", "step": 816, "epoch": 0.34430379746835443, "progress_pct": 5.74, "epoch_pct": 5.74, "eta": "66:28:11", "max_grad_norm": 0.8, "loss": 0.848559558391571, "grad_norm": 1.6570971012115479, "learning_rate": 7.161687170474517e-05} +{"ts": "2025-12-26T22:11:59", "event": "train_log", "step": 818, "epoch": 0.34514767932489454, "progress_pct": 5.75, "epoch_pct": 5.75, "eta": "66:22:57", "max_grad_norm": 0.8, "loss": 0.8847543597221375, "grad_norm": 1.9108437299728394, "learning_rate": 7.179261862917399e-05} +{"ts": "2025-12-26T22:12:19", "event": "train_log", "step": 820, "epoch": 0.3459915611814346, "progress_pct": 5.77, "epoch_pct": 5.77, "eta": "66:18:15", "max_grad_norm": 0.8, "loss": 0.7642563581466675, "grad_norm": 1.4909496307373047, "learning_rate": 7.196836555360281e-05} +{"ts": "2025-12-26T22:12:38", "event": "train_log", "step": 822, "epoch": 0.3468354430379747, "progress_pct": 5.78, "epoch_pct": 5.78, "eta": "66:13:11", "max_grad_norm": 0.8, "loss": 0.8714305758476257, "grad_norm": 1.768518328666687, "learning_rate": 7.214411247803163e-05} +{"ts": "2025-12-26T22:12:58", "event": "train_log", "step": 824, "epoch": 0.34767932489451475, "progress_pct": 5.79, "epoch_pct": 5.79, "eta": "66:08:23", "max_grad_norm": 0.8, "loss": 0.7712987661361694, "grad_norm": 1.715343952178955, "learning_rate": 7.231985940246046e-05} +{"ts": "2025-12-26T22:13:17", "event": "train_log", "step": 826, "epoch": 0.34852320675105486, "progress_pct": 5.81, "epoch_pct": 5.81, "eta": "66:03:20", "max_grad_norm": 0.8, "loss": 0.8122798204421997, "grad_norm": 1.6687803268432617, "learning_rate": 7.24956063268893e-05} +{"ts": "2025-12-26T22:13:38", "event": "train_log", "step": 828, "epoch": 0.3493670886075949, "progress_pct": 5.82, "epoch_pct": 5.82, "eta": "65:58:37", "max_grad_norm": 0.8, "loss": 0.793245792388916, "grad_norm": 1.5160514116287231, "learning_rate": 7.267135325131811e-05} +{"ts": "2025-12-26T22:13:56", "event": "train_log", "step": 830, "epoch": 0.350210970464135, "progress_pct": 5.84, "epoch_pct": 5.84, "eta": "65:53:28", "max_grad_norm": 0.8, "loss": 0.8747497200965881, "grad_norm": 1.6449401378631592, "learning_rate": 7.284710017574693e-05} +{"ts": "2025-12-26T22:14:17", "event": "train_log", "step": 832, "epoch": 0.3510548523206751, "progress_pct": 5.85, "epoch_pct": 5.85, "eta": "65:48:53", "max_grad_norm": 0.8, "loss": 0.6743978261947632, "grad_norm": 1.3907722234725952, "learning_rate": 7.302284710017575e-05} +{"ts": "2025-12-26T22:14:36", "event": "train_log", "step": 834, "epoch": 0.3518987341772152, "progress_pct": 5.86, "epoch_pct": 5.86, "eta": "65:43:53", "max_grad_norm": 0.8, "loss": 0.8524789214134216, "grad_norm": 1.633555293083191, "learning_rate": 7.319859402460457e-05} +{"ts": "2025-12-26T22:14:56", "event": "train_log", "step": 836, "epoch": 0.3527426160337553, "progress_pct": 5.88, "epoch_pct": 5.88, "eta": "65:39:22", "max_grad_norm": 0.8, "loss": 0.8045110702514648, "grad_norm": 1.5414257049560547, "learning_rate": 7.337434094903339e-05} +{"ts": "2025-12-26T22:15:15", "event": "train_log", "step": 838, "epoch": 0.35358649789029534, "progress_pct": 5.89, "epoch_pct": 5.89, "eta": "65:34:19", "max_grad_norm": 0.8, "loss": 0.8319593071937561, "grad_norm": 1.8520616292953491, "learning_rate": 7.355008787346221e-05} +{"ts": "2025-12-26T22:15:33", "event": "train_log", "step": 840, "epoch": 0.35443037974683544, "progress_pct": 5.91, "epoch_pct": 5.91, "eta": "65:29:14", "max_grad_norm": 0.8, "loss": 0.8188939094543457, "grad_norm": 1.6629763841629028, "learning_rate": 7.372583479789104e-05} +{"ts": "2025-12-26T22:15:53", "event": "train_log", "step": 842, "epoch": 0.35527426160337555, "progress_pct": 5.92, "epoch_pct": 5.92, "eta": "65:24:29", "max_grad_norm": 0.8, "loss": 0.8875360488891602, "grad_norm": 1.804087519645691, "learning_rate": 7.390158172231987e-05} +{"ts": "2025-12-26T22:16:13", "event": "train_log", "step": 844, "epoch": 0.3561181434599156, "progress_pct": 5.94, "epoch_pct": 5.94, "eta": "65:19:54", "max_grad_norm": 0.8, "loss": 0.8159612417221069, "grad_norm": 1.6031663417816162, "learning_rate": 7.407732864674869e-05} +{"ts": "2025-12-26T22:16:33", "event": "train_log", "step": 846, "epoch": 0.3569620253164557, "progress_pct": 5.95, "epoch_pct": 5.95, "eta": "65:15:15", "max_grad_norm": 0.8, "loss": 0.8422684669494629, "grad_norm": 1.7413033246994019, "learning_rate": 7.425307557117751e-05} +{"ts": "2025-12-26T22:16:52", "event": "train_log", "step": 848, "epoch": 0.35780590717299576, "progress_pct": 5.96, "epoch_pct": 5.96, "eta": "65:10:35", "max_grad_norm": 0.8, "loss": 0.9343502521514893, "grad_norm": 1.7699719667434692, "learning_rate": 7.442882249560633e-05} +{"ts": "2025-12-26T22:17:12", "event": "train_log", "step": 850, "epoch": 0.35864978902953587, "progress_pct": 5.98, "epoch_pct": 5.98, "eta": "65:06:06", "max_grad_norm": 0.8, "loss": 0.8168979287147522, "grad_norm": 1.4613301753997803, "learning_rate": 7.460456942003515e-05} +{"ts": "2025-12-26T22:17:32", "event": "train_log", "step": 852, "epoch": 0.3594936708860759, "progress_pct": 5.99, "epoch_pct": 5.99, "eta": "65:01:25", "max_grad_norm": 0.8, "loss": 0.9014382362365723, "grad_norm": 1.542431354522705, "learning_rate": 7.478031634446397e-05} +{"ts": "2025-12-26T22:17:50", "event": "train_log", "step": 854, "epoch": 0.36033755274261603, "progress_pct": 6.01, "epoch_pct": 6.01, "eta": "64:56:32", "max_grad_norm": 0.8, "loss": 0.8162738084793091, "grad_norm": 1.6070159673690796, "learning_rate": 7.49560632688928e-05} +{"ts": "2025-12-26T22:18:09", "event": "train_log", "step": 856, "epoch": 0.36118143459915614, "progress_pct": 6.02, "epoch_pct": 6.02, "eta": "64:51:50", "max_grad_norm": 0.8, "loss": 0.8354527950286865, "grad_norm": 1.7979451417922974, "learning_rate": 7.513181019332162e-05} +{"ts": "2025-12-26T22:18:28", "event": "train_log", "step": 858, "epoch": 0.3620253164556962, "progress_pct": 6.03, "epoch_pct": 6.03, "eta": "64:47:07", "max_grad_norm": 0.8, "loss": 0.8214042782783508, "grad_norm": 2.327045202255249, "learning_rate": 7.530755711775044e-05} +{"ts": "2025-12-26T22:18:49", "event": "train_log", "step": 860, "epoch": 0.3628691983122363, "progress_pct": 6.05, "epoch_pct": 6.05, "eta": "64:42:44", "max_grad_norm": 0.8, "loss": 0.7472147941589355, "grad_norm": 1.5085111856460571, "learning_rate": 7.548330404217927e-05} +{"ts": "2025-12-26T22:19:09", "event": "train_log", "step": 862, "epoch": 0.36371308016877635, "progress_pct": 6.06, "epoch_pct": 6.06, "eta": "64:38:16", "max_grad_norm": 0.8, "loss": 0.7586950063705444, "grad_norm": 1.6006290912628174, "learning_rate": 7.565905096660809e-05} +{"ts": "2025-12-26T22:19:29", "event": "train_log", "step": 864, "epoch": 0.36455696202531646, "progress_pct": 6.08, "epoch_pct": 6.08, "eta": "64:34:04", "max_grad_norm": 0.8, "loss": 0.8169914484024048, "grad_norm": 1.5170620679855347, "learning_rate": 7.583479789103691e-05} +{"ts": "2025-12-26T22:19:49", "event": "train_log", "step": 866, "epoch": 0.36540084388185656, "progress_pct": 6.09, "epoch_pct": 6.09, "eta": "64:29:33", "max_grad_norm": 0.8, "loss": 0.8263922929763794, "grad_norm": 1.5848352909088135, "learning_rate": 7.601054481546573e-05} +{"ts": "2025-12-26T22:20:08", "event": "train_log", "step": 868, "epoch": 0.3662447257383966, "progress_pct": 6.1, "epoch_pct": 6.1, "eta": "64:25:00", "max_grad_norm": 0.8, "loss": 0.8726240992546082, "grad_norm": 1.8502342700958252, "learning_rate": 7.618629173989455e-05} +{"ts": "2025-12-26T22:20:28", "event": "train_log", "step": 870, "epoch": 0.3670886075949367, "progress_pct": 6.12, "epoch_pct": 6.12, "eta": "64:20:39", "max_grad_norm": 0.8, "loss": 0.7220374941825867, "grad_norm": 1.506847620010376, "learning_rate": 7.636203866432338e-05} +{"ts": "2025-12-26T22:20:48", "event": "train_log", "step": 872, "epoch": 0.3679324894514768, "progress_pct": 6.13, "epoch_pct": 6.13, "eta": "64:16:17", "max_grad_norm": 0.8, "loss": 0.8028547167778015, "grad_norm": 1.5350452661514282, "learning_rate": 7.65377855887522e-05} +{"ts": "2025-12-26T22:21:08", "event": "train_log", "step": 874, "epoch": 0.3687763713080169, "progress_pct": 6.15, "epoch_pct": 6.15, "eta": "64:11:55", "max_grad_norm": 0.8, "loss": 0.7659649848937988, "grad_norm": 1.5011043548583984, "learning_rate": 7.671353251318102e-05} +{"ts": "2025-12-26T22:21:27", "event": "train_log", "step": 876, "epoch": 0.369620253164557, "progress_pct": 6.16, "epoch_pct": 6.16, "eta": "64:07:31", "max_grad_norm": 0.8, "loss": 0.8773653507232666, "grad_norm": 1.7019832134246826, "learning_rate": 7.688927943760984e-05} +{"ts": "2025-12-26T22:21:47", "event": "train_log", "step": 878, "epoch": 0.37046413502109704, "progress_pct": 6.17, "epoch_pct": 6.17, "eta": "64:03:10", "max_grad_norm": 0.8, "loss": 0.7977569103240967, "grad_norm": 1.4918498992919922, "learning_rate": 7.706502636203867e-05} +{"ts": "2025-12-26T22:22:07", "event": "train_log", "step": 880, "epoch": 0.37130801687763715, "progress_pct": 6.19, "epoch_pct": 6.19, "eta": "63:59:01", "max_grad_norm": 0.8, "loss": 0.7491976022720337, "grad_norm": 1.6422638893127441, "learning_rate": 7.724077328646749e-05} +{"ts": "2025-12-26T22:22:25", "event": "train_log", "step": 882, "epoch": 0.3721518987341772, "progress_pct": 6.2, "epoch_pct": 6.2, "eta": "63:54:14", "max_grad_norm": 0.8, "loss": 0.8754181265830994, "grad_norm": 1.7590434551239014, "learning_rate": 7.741652021089631e-05} +{"ts": "2025-12-26T22:22:47", "event": "train_log", "step": 884, "epoch": 0.3729957805907173, "progress_pct": 6.22, "epoch_pct": 6.22, "eta": "63:50:20", "max_grad_norm": 0.8, "loss": 0.8482301235198975, "grad_norm": 3.868894100189209, "learning_rate": 7.759226713532513e-05} +{"ts": "2025-12-26T22:23:06", "event": "train_log", "step": 886, "epoch": 0.37383966244725736, "progress_pct": 6.23, "epoch_pct": 6.23, "eta": "63:46:07", "max_grad_norm": 0.8, "loss": 0.8109031915664673, "grad_norm": 2.111875534057617, "learning_rate": 7.776801405975396e-05} +{"ts": "2025-12-26T22:23:26", "event": "train_log", "step": 888, "epoch": 0.37468354430379747, "progress_pct": 6.24, "epoch_pct": 6.24, "eta": "63:41:56", "max_grad_norm": 0.8, "loss": 0.8660775423049927, "grad_norm": 2.0838418006896973, "learning_rate": 7.794376098418278e-05} +{"ts": "2025-12-26T22:23:46", "event": "train_log", "step": 890, "epoch": 0.3755274261603376, "progress_pct": 6.26, "epoch_pct": 6.26, "eta": "63:37:35", "max_grad_norm": 0.8, "loss": 0.8418024778366089, "grad_norm": 1.553022027015686, "learning_rate": 7.81195079086116e-05} +{"ts": "2025-12-26T22:24:07", "event": "train_log", "step": 892, "epoch": 0.3763713080168776, "progress_pct": 6.27, "epoch_pct": 6.27, "eta": "63:33:49", "max_grad_norm": 0.8, "loss": 0.7764869928359985, "grad_norm": 1.334747314453125, "learning_rate": 7.829525483304042e-05} +{"ts": "2025-12-26T22:24:27", "event": "train_log", "step": 894, "epoch": 0.37721518987341773, "progress_pct": 6.29, "epoch_pct": 6.29, "eta": "63:29:32", "max_grad_norm": 0.8, "loss": 0.7460401654243469, "grad_norm": 1.4692286252975464, "learning_rate": 7.847100175746925e-05} +{"ts": "2025-12-26T22:24:47", "event": "train_log", "step": 896, "epoch": 0.3780590717299578, "progress_pct": 6.3, "epoch_pct": 6.3, "eta": "63:25:31", "max_grad_norm": 0.8, "loss": 0.7662873268127441, "grad_norm": 1.5374023914337158, "learning_rate": 7.864674868189807e-05} +{"ts": "2025-12-26T22:25:08", "event": "train_log", "step": 898, "epoch": 0.3789029535864979, "progress_pct": 6.32, "epoch_pct": 6.32, "eta": "63:21:35", "max_grad_norm": 0.8, "loss": 0.8165306448936462, "grad_norm": 1.5662524700164795, "learning_rate": 7.882249560632689e-05} +{"ts": "2025-12-26T22:25:29", "event": "train_log", "step": 900, "epoch": 0.379746835443038, "progress_pct": 6.33, "epoch_pct": 6.33, "eta": "63:17:53", "max_grad_norm": 0.8, "loss": 0.7913232445716858, "grad_norm": 4.498590469360352, "learning_rate": 7.899824253075572e-05} +{"ts": "2025-12-26T22:39:42", "event": "train_log", "step": 900, "epoch": 0.379746835443038, "progress_pct": 6.33, "epoch_pct": 6.33, "eta": "66:48:12", "max_grad_norm": 0.8, "eval_loss": 0.8491304516792297, "eval_runtime": 852.6211, "eval_samples_per_second": 2.471, "eval_steps_per_second": 2.471} +{"ts": "2025-12-26T22:40:01", "event": "train_log", "step": 902, "epoch": 0.38059071729957805, "progress_pct": 6.34, "epoch_pct": 6.34, "eta": "66:43:29", "max_grad_norm": 0.8, "loss": 0.8097161054611206, "grad_norm": 1.6320613622665405, "learning_rate": 7.917398945518454e-05} +{"ts": "2025-12-26T22:40:22", "event": "train_log", "step": 904, "epoch": 0.38143459915611816, "progress_pct": 6.36, "epoch_pct": 6.36, "eta": "66:39:04", "max_grad_norm": 0.8, "loss": 0.786399781703949, "grad_norm": 1.2562934160232544, "learning_rate": 7.934973637961336e-05} +{"ts": "2025-12-26T22:40:40", "event": "train_log", "step": 906, "epoch": 0.3822784810126582, "progress_pct": 6.37, "epoch_pct": 6.37, "eta": "66:34:07", "max_grad_norm": 0.8, "loss": 0.8385500311851501, "grad_norm": 1.6957594156265259, "learning_rate": 7.952548330404218e-05} +{"ts": "2025-12-26T22:40:58", "event": "train_log", "step": 908, "epoch": 0.3831223628691983, "progress_pct": 6.39, "epoch_pct": 6.39, "eta": "66:29:11", "max_grad_norm": 0.8, "loss": 0.8157848715782166, "grad_norm": 1.6662386655807495, "learning_rate": 7.9701230228471e-05} +{"ts": "2025-12-26T22:41:17", "event": "train_log", "step": 910, "epoch": 0.38396624472573837, "progress_pct": 6.4, "epoch_pct": 6.4, "eta": "66:24:24", "max_grad_norm": 0.8, "loss": 0.7937968373298645, "grad_norm": 1.6717777252197266, "learning_rate": 7.987697715289982e-05} +{"ts": "2025-12-26T22:41:37", "event": "train_log", "step": 912, "epoch": 0.3848101265822785, "progress_pct": 6.41, "epoch_pct": 6.41, "eta": "66:19:56", "max_grad_norm": 0.8, "loss": 0.7800109386444092, "grad_norm": 1.399484395980835, "learning_rate": 8.005272407732865e-05} +{"ts": "2025-12-26T22:41:57", "event": "train_log", "step": 914, "epoch": 0.3856540084388186, "progress_pct": 6.43, "epoch_pct": 6.43, "eta": "66:15:26", "max_grad_norm": 0.8, "loss": 0.8135939240455627, "grad_norm": 1.5671080350875854, "learning_rate": 8.022847100175747e-05} +{"ts": "2025-12-26T22:42:16", "event": "train_log", "step": 916, "epoch": 0.38649789029535864, "progress_pct": 6.44, "epoch_pct": 6.44, "eta": "66:10:38", "max_grad_norm": 0.8, "loss": 0.7482035160064697, "grad_norm": 1.4427763223648071, "learning_rate": 8.04042179261863e-05} +{"ts": "2025-12-26T22:42:35", "event": "train_log", "step": 918, "epoch": 0.38734177215189874, "progress_pct": 6.46, "epoch_pct": 6.46, "eta": "66:06:11", "max_grad_norm": 0.8, "loss": 0.7201873064041138, "grad_norm": 1.3314121961593628, "learning_rate": 8.057996485061512e-05} +{"ts": "2025-12-26T22:42:55", "event": "train_log", "step": 920, "epoch": 0.3881856540084388, "progress_pct": 6.47, "epoch_pct": 6.47, "eta": "66:01:45", "max_grad_norm": 0.8, "loss": 0.7933040857315063, "grad_norm": 1.5695286989212036, "learning_rate": 8.075571177504394e-05} +{"ts": "2025-12-26T22:43:16", "event": "train_log", "step": 922, "epoch": 0.3890295358649789, "progress_pct": 6.48, "epoch_pct": 6.48, "eta": "65:57:32", "max_grad_norm": 0.8, "loss": 0.8058338165283203, "grad_norm": 1.5091747045516968, "learning_rate": 8.093145869947276e-05} +{"ts": "2025-12-26T22:43:37", "event": "train_log", "step": 924, "epoch": 0.389873417721519, "progress_pct": 6.5, "epoch_pct": 6.5, "eta": "65:53:19", "max_grad_norm": 0.8, "loss": 0.7617828249931335, "grad_norm": 1.6287630796432495, "learning_rate": 8.110720562390158e-05} +{"ts": "2025-12-26T22:43:55", "event": "train_log", "step": 926, "epoch": 0.39071729957805906, "progress_pct": 6.51, "epoch_pct": 6.51, "eta": "65:48:36", "max_grad_norm": 0.8, "loss": 0.8710150122642517, "grad_norm": 1.6129482984542847, "learning_rate": 8.12829525483304e-05} +{"ts": "2025-12-26T22:44:13", "event": "train_log", "step": 928, "epoch": 0.39156118143459917, "progress_pct": 6.53, "epoch_pct": 6.53, "eta": "65:43:54", "max_grad_norm": 0.8, "loss": 0.9122233390808105, "grad_norm": 1.6457173824310303, "learning_rate": 8.145869947275922e-05} +{"ts": "2025-12-26T22:44:35", "event": "train_log", "step": 930, "epoch": 0.3924050632911392, "progress_pct": 6.54, "epoch_pct": 6.54, "eta": "65:40:02", "max_grad_norm": 0.8, "loss": 0.8339303731918335, "grad_norm": 1.6768827438354492, "learning_rate": 8.163444639718805e-05} +{"ts": "2025-12-26T22:44:56", "event": "train_log", "step": 932, "epoch": 0.39324894514767933, "progress_pct": 6.55, "epoch_pct": 6.55, "eta": "65:35:50", "max_grad_norm": 0.8, "loss": 0.8220396041870117, "grad_norm": 1.5419740676879883, "learning_rate": 8.181019332161688e-05} +{"ts": "2025-12-26T22:45:16", "event": "train_log", "step": 934, "epoch": 0.39409282700421944, "progress_pct": 6.57, "epoch_pct": 6.57, "eta": "65:31:44", "max_grad_norm": 0.8, "loss": 0.8531478047370911, "grad_norm": 1.4563747644424438, "learning_rate": 8.19859402460457e-05} +{"ts": "2025-12-26T22:45:36", "event": "train_log", "step": 936, "epoch": 0.3949367088607595, "progress_pct": 6.58, "epoch_pct": 6.58, "eta": "65:27:21", "max_grad_norm": 0.8, "loss": 0.8330869078636169, "grad_norm": 1.6208328008651733, "learning_rate": 8.216168717047452e-05} +{"ts": "2025-12-26T22:45:56", "event": "train_log", "step": 938, "epoch": 0.3957805907172996, "progress_pct": 6.6, "epoch_pct": 6.6, "eta": "65:23:00", "max_grad_norm": 0.8, "loss": 0.8011296987533569, "grad_norm": 1.6492482423782349, "learning_rate": 8.233743409490334e-05} +{"ts": "2025-12-26T22:46:15", "event": "train_log", "step": 940, "epoch": 0.39662447257383965, "progress_pct": 6.61, "epoch_pct": 6.61, "eta": "65:18:39", "max_grad_norm": 0.8, "loss": 0.8111353516578674, "grad_norm": 2.1611905097961426, "learning_rate": 8.251318101933216e-05} +{"ts": "2025-12-26T22:46:36", "event": "train_log", "step": 942, "epoch": 0.39746835443037976, "progress_pct": 6.62, "epoch_pct": 6.62, "eta": "65:14:39", "max_grad_norm": 0.8, "loss": 0.8282017111778259, "grad_norm": 1.7108231782913208, "learning_rate": 8.268892794376098e-05} +{"ts": "2025-12-26T22:46:56", "event": "train_log", "step": 944, "epoch": 0.3983122362869198, "progress_pct": 6.64, "epoch_pct": 6.64, "eta": "65:10:34", "max_grad_norm": 0.8, "loss": 0.7770059704780579, "grad_norm": 1.543465495109558, "learning_rate": 8.286467486818981e-05} +{"ts": "2025-12-26T22:47:17", "event": "train_log", "step": 946, "epoch": 0.3991561181434599, "progress_pct": 6.65, "epoch_pct": 6.65, "eta": "65:06:34", "max_grad_norm": 0.8, "loss": 0.8646430373191833, "grad_norm": 1.419969081878662, "learning_rate": 8.304042179261863e-05} +{"ts": "2025-12-26T22:47:36", "event": "train_log", "step": 948, "epoch": 0.4, "progress_pct": 6.67, "epoch_pct": 6.67, "eta": "65:02:07", "max_grad_norm": 0.8, "loss": 0.7949403524398804, "grad_norm": 1.5002100467681885, "learning_rate": 8.321616871704746e-05} +{"ts": "2025-12-26T22:47:57", "event": "train_log", "step": 950, "epoch": 0.4008438818565401, "progress_pct": 6.68, "epoch_pct": 6.68, "eta": "64:58:15", "max_grad_norm": 0.8, "loss": 0.8124079704284668, "grad_norm": 1.38933265209198, "learning_rate": 8.339191564147628e-05} +{"ts": "2025-12-26T22:48:16", "event": "train_log", "step": 952, "epoch": 0.4016877637130802, "progress_pct": 6.69, "epoch_pct": 6.69, "eta": "64:53:46", "max_grad_norm": 0.8, "loss": 0.8634148836135864, "grad_norm": 1.5948443412780762, "learning_rate": 8.35676625659051e-05} +{"ts": "2025-12-26T22:48:36", "event": "train_log", "step": 954, "epoch": 0.40253164556962023, "progress_pct": 6.71, "epoch_pct": 6.71, "eta": "64:49:50", "max_grad_norm": 0.8, "loss": 0.7410681247711182, "grad_norm": 1.4437624216079712, "learning_rate": 8.374340949033392e-05} +{"ts": "2025-12-26T22:48:58", "event": "train_log", "step": 956, "epoch": 0.40337552742616034, "progress_pct": 6.72, "epoch_pct": 6.72, "eta": "64:46:01", "max_grad_norm": 0.8, "loss": 0.7680280208587646, "grad_norm": 1.3457095623016357, "learning_rate": 8.391915641476274e-05} +{"ts": "2025-12-26T22:49:16", "event": "train_log", "step": 958, "epoch": 0.40421940928270045, "progress_pct": 6.74, "epoch_pct": 6.74, "eta": "64:41:28", "max_grad_norm": 0.8, "loss": 0.7921904921531677, "grad_norm": 1.610288143157959, "learning_rate": 8.409490333919156e-05} +{"ts": "2025-12-26T22:49:35", "event": "train_log", "step": 960, "epoch": 0.4050632911392405, "progress_pct": 6.75, "epoch_pct": 6.75, "eta": "64:37:20", "max_grad_norm": 0.8, "loss": 0.8320037126541138, "grad_norm": 1.5321530103683472, "learning_rate": 8.427065026362039e-05} +{"ts": "2025-12-26T22:49:55", "event": "train_log", "step": 962, "epoch": 0.4059071729957806, "progress_pct": 6.77, "epoch_pct": 6.77, "eta": "64:33:12", "max_grad_norm": 0.8, "loss": 0.8303092122077942, "grad_norm": 1.699881672859192, "learning_rate": 8.444639718804921e-05} +{"ts": "2025-12-26T22:50:13", "event": "train_log", "step": 964, "epoch": 0.40675105485232066, "progress_pct": 6.78, "epoch_pct": 6.78, "eta": "64:28:40", "max_grad_norm": 0.8, "loss": 0.9029796719551086, "grad_norm": 1.591515064239502, "learning_rate": 8.462214411247804e-05} +{"ts": "2025-12-26T22:50:35", "event": "train_log", "step": 966, "epoch": 0.40759493670886077, "progress_pct": 6.79, "epoch_pct": 6.79, "eta": "64:25:12", "max_grad_norm": 0.8, "loss": 0.8165359497070312, "grad_norm": 1.5930429697036743, "learning_rate": 8.479789103690686e-05} +{"ts": "2025-12-26T22:50:55", "event": "train_log", "step": 968, "epoch": 0.4084388185654008, "progress_pct": 6.81, "epoch_pct": 6.81, "eta": "64:21:05", "max_grad_norm": 0.8, "loss": 0.8276026248931885, "grad_norm": 1.509774923324585, "learning_rate": 8.497363796133568e-05} +{"ts": "2025-12-26T22:51:15", "event": "train_log", "step": 970, "epoch": 0.4092827004219409, "progress_pct": 6.82, "epoch_pct": 6.82, "eta": "64:17:13", "max_grad_norm": 0.8, "loss": 0.8159419894218445, "grad_norm": 1.3617016077041626, "learning_rate": 8.51493848857645e-05} +{"ts": "2025-12-26T22:51:35", "event": "train_log", "step": 972, "epoch": 0.41012658227848103, "progress_pct": 6.84, "epoch_pct": 6.84, "eta": "64:13:16", "max_grad_norm": 0.8, "loss": 0.7882336378097534, "grad_norm": 1.3580708503723145, "learning_rate": 8.532513181019332e-05} +{"ts": "2025-12-26T22:51:55", "event": "train_log", "step": 974, "epoch": 0.4109704641350211, "progress_pct": 6.85, "epoch_pct": 6.85, "eta": "64:09:15", "max_grad_norm": 0.8, "loss": 0.7462319731712341, "grad_norm": 1.3337358236312866, "learning_rate": 8.550087873462214e-05} +{"ts": "2025-12-26T22:52:16", "event": "train_log", "step": 976, "epoch": 0.4118143459915612, "progress_pct": 6.86, "epoch_pct": 6.86, "eta": "64:05:27", "max_grad_norm": 0.8, "loss": 0.7500866651535034, "grad_norm": 1.450363278388977, "learning_rate": 8.567662565905097e-05} +{"ts": "2025-12-26T22:52:35", "event": "train_log", "step": 978, "epoch": 0.41265822784810124, "progress_pct": 6.88, "epoch_pct": 6.88, "eta": "64:01:28", "max_grad_norm": 0.8, "loss": 0.8432503342628479, "grad_norm": 1.5305321216583252, "learning_rate": 8.585237258347979e-05} +{"ts": "2025-12-26T22:52:56", "event": "train_log", "step": 980, "epoch": 0.41350210970464135, "progress_pct": 6.89, "epoch_pct": 6.89, "eta": "63:57:42", "max_grad_norm": 0.8, "loss": 0.8330482840538025, "grad_norm": 1.2097326517105103, "learning_rate": 8.602811950790861e-05} +{"ts": "2025-12-26T22:53:15", "event": "train_log", "step": 982, "epoch": 0.41434599156118146, "progress_pct": 6.91, "epoch_pct": 6.91, "eta": "63:53:35", "max_grad_norm": 0.8, "loss": 0.8137149810791016, "grad_norm": 1.3916101455688477, "learning_rate": 8.620386643233744e-05} +{"ts": "2025-12-26T22:53:34", "event": "train_log", "step": 984, "epoch": 0.4151898734177215, "progress_pct": 6.92, "epoch_pct": 6.92, "eta": "63:49:24", "max_grad_norm": 0.8, "loss": 0.8273854851722717, "grad_norm": 1.6411453485488892, "learning_rate": 8.637961335676626e-05} +{"ts": "2025-12-26T22:53:54", "event": "train_log", "step": 986, "epoch": 0.4160337552742616, "progress_pct": 6.93, "epoch_pct": 6.93, "eta": "63:45:40", "max_grad_norm": 0.8, "loss": 0.794026255607605, "grad_norm": 1.6734566688537598, "learning_rate": 8.655536028119508e-05} +{"ts": "2025-12-26T22:54:13", "event": "train_log", "step": 988, "epoch": 0.41687763713080167, "progress_pct": 6.95, "epoch_pct": 6.95, "eta": "63:41:35", "max_grad_norm": 0.8, "loss": 0.7721655368804932, "grad_norm": 1.352325677871704, "learning_rate": 8.67311072056239e-05} +{"ts": "2025-12-26T22:54:34", "event": "train_log", "step": 990, "epoch": 0.4177215189873418, "progress_pct": 6.96, "epoch_pct": 6.96, "eta": "63:37:47", "max_grad_norm": 0.8, "loss": 0.8123438954353333, "grad_norm": 1.5368729829788208, "learning_rate": 8.690685413005273e-05} +{"ts": "2025-12-26T22:54:54", "event": "train_log", "step": 992, "epoch": 0.41856540084388183, "progress_pct": 6.98, "epoch_pct": 6.98, "eta": "63:34:06", "max_grad_norm": 0.8, "loss": 0.8370974659919739, "grad_norm": 1.4903568029403687, "learning_rate": 8.708260105448155e-05} +{"ts": "2025-12-26T22:55:13", "event": "train_log", "step": 994, "epoch": 0.41940928270042194, "progress_pct": 6.99, "epoch_pct": 6.99, "eta": "63:30:01", "max_grad_norm": 0.8, "loss": 0.780426561832428, "grad_norm": 1.3405622243881226, "learning_rate": 8.725834797891037e-05} +{"ts": "2025-12-26T22:55:33", "event": "train_log", "step": 996, "epoch": 0.42025316455696204, "progress_pct": 7.0, "epoch_pct": 7.0, "eta": "63:26:10", "max_grad_norm": 0.8, "loss": 0.8304934501647949, "grad_norm": 1.4761021137237549, "learning_rate": 8.743409490333919e-05} +{"ts": "2025-12-26T22:55:52", "event": "train_log", "step": 998, "epoch": 0.4210970464135021, "progress_pct": 7.02, "epoch_pct": 7.02, "eta": "63:22:16", "max_grad_norm": 0.8, "loss": 0.7960568070411682, "grad_norm": 1.520033359527588, "learning_rate": 8.760984182776801e-05} +{"ts": "2025-12-26T22:56:12", "event": "train_log", "step": 1000, "epoch": 0.4219409282700422, "progress_pct": 7.03, "epoch_pct": 7.03, "eta": "63:18:25", "max_grad_norm": 0.8, "loss": 0.7884663939476013, "grad_norm": 1.6916255950927734, "learning_rate": 8.778558875219684e-05} +{"ts": "2025-12-26T23:10:19", "event": "train_log", "step": 1000, "epoch": 0.4219409282700422, "progress_pct": 7.03, "epoch_pct": 7.03, "eta": "66:25:09", "max_grad_norm": 0.8, "eval_loss": 0.8388314247131348, "eval_runtime": 847.4828, "eval_samples_per_second": 2.486, "eval_steps_per_second": 2.486} +{"ts": "2025-12-26T23:10:41", "event": "train_log", "step": 1002, "epoch": 0.42278481012658226, "progress_pct": 7.05, "epoch_pct": 7.05, "eta": "66:21:14", "max_grad_norm": 0.8, "loss": 0.7930826544761658, "grad_norm": 1.6796396970748901, "learning_rate": 8.796133567662566e-05} +{"ts": "2025-12-26T23:11:02", "event": "train_log", "step": 1004, "epoch": 0.42362869198312236, "progress_pct": 7.06, "epoch_pct": 7.06, "eta": "66:17:31", "max_grad_norm": 0.8, "loss": 0.7138194441795349, "grad_norm": 1.4480048418045044, "learning_rate": 8.813708260105448e-05} +{"ts": "2025-12-26T23:11:23", "event": "train_log", "step": 1006, "epoch": 0.42447257383966247, "progress_pct": 7.07, "epoch_pct": 7.07, "eta": "66:13:35", "max_grad_norm": 0.8, "loss": 0.7367453575134277, "grad_norm": 1.2499021291732788, "learning_rate": 8.831282952548331e-05} +{"ts": "2025-12-26T23:11:42", "event": "train_log", "step": 1008, "epoch": 0.4253164556962025, "progress_pct": 7.09, "epoch_pct": 7.09, "eta": "66:09:09", "max_grad_norm": 0.8, "loss": 0.9051005244255066, "grad_norm": 1.6906769275665283, "learning_rate": 8.848857644991213e-05} +{"ts": "2025-12-26T23:12:02", "event": "train_log", "step": 1010, "epoch": 0.42616033755274263, "progress_pct": 7.1, "epoch_pct": 7.1, "eta": "66:05:02", "max_grad_norm": 0.8, "loss": 0.7469457387924194, "grad_norm": 1.4196792840957642, "learning_rate": 8.866432337434095e-05} +{"ts": "2025-12-26T23:12:21", "event": "train_log", "step": 1012, "epoch": 0.4270042194092827, "progress_pct": 7.12, "epoch_pct": 7.12, "eta": "66:00:45", "max_grad_norm": 0.8, "loss": 0.7443049550056458, "grad_norm": 1.5132776498794556, "learning_rate": 8.884007029876977e-05} +{"ts": "2025-12-26T23:12:42", "event": "train_log", "step": 1014, "epoch": 0.4278481012658228, "progress_pct": 7.13, "epoch_pct": 7.13, "eta": "65:56:50", "max_grad_norm": 0.8, "loss": 0.784084677696228, "grad_norm": 1.335705280303955, "learning_rate": 8.901581722319859e-05} +{"ts": "2025-12-26T23:13:01", "event": "train_log", "step": 1016, "epoch": 0.4286919831223629, "progress_pct": 7.14, "epoch_pct": 7.14, "eta": "65:52:33", "max_grad_norm": 0.8, "loss": 0.8603647947311401, "grad_norm": 1.6510252952575684, "learning_rate": 8.919156414762741e-05} +{"ts": "2025-12-26T23:13:20", "event": "train_log", "step": 1018, "epoch": 0.42953586497890295, "progress_pct": 7.16, "epoch_pct": 7.16, "eta": "65:48:28", "max_grad_norm": 0.8, "loss": 0.7921645641326904, "grad_norm": 1.35535728931427, "learning_rate": 8.936731107205624e-05} +{"ts": "2025-12-26T23:13:40", "event": "train_log", "step": 1020, "epoch": 0.43037974683544306, "progress_pct": 7.17, "epoch_pct": 7.17, "eta": "65:44:17", "max_grad_norm": 0.8, "loss": 0.799993634223938, "grad_norm": 1.4952049255371094, "learning_rate": 8.954305799648506e-05} +{"ts": "2025-12-26T23:14:00", "event": "train_log", "step": 1022, "epoch": 0.4312236286919831, "progress_pct": 7.19, "epoch_pct": 7.19, "eta": "65:40:18", "max_grad_norm": 0.8, "loss": 0.7697094082832336, "grad_norm": 1.5026042461395264, "learning_rate": 8.97188049209139e-05} +{"ts": "2025-12-26T23:14:19", "event": "train_log", "step": 1024, "epoch": 0.4320675105485232, "progress_pct": 7.2, "epoch_pct": 7.2, "eta": "65:36:12", "max_grad_norm": 0.8, "loss": 0.7988215684890747, "grad_norm": 1.5424275398254395, "learning_rate": 8.989455184534271e-05} +{"ts": "2025-12-26T23:14:39", "event": "train_log", "step": 1026, "epoch": 0.43291139240506327, "progress_pct": 7.22, "epoch_pct": 7.22, "eta": "65:32:10", "max_grad_norm": 0.8, "loss": 0.7841635942459106, "grad_norm": 1.438716173171997, "learning_rate": 9.007029876977153e-05} +{"ts": "2025-12-26T23:15:00", "event": "train_log", "step": 1028, "epoch": 0.4337552742616034, "progress_pct": 7.23, "epoch_pct": 7.23, "eta": "65:28:24", "max_grad_norm": 0.8, "loss": 0.7485025525093079, "grad_norm": 1.5040369033813477, "learning_rate": 9.024604569420035e-05} +{"ts": "2025-12-26T23:15:20", "event": "train_log", "step": 1030, "epoch": 0.4345991561181435, "progress_pct": 7.24, "epoch_pct": 7.24, "eta": "65:24:22", "max_grad_norm": 0.8, "loss": 0.7735623121261597, "grad_norm": 1.4354394674301147, "learning_rate": 9.042179261862917e-05} +{"ts": "2025-12-26T23:15:39", "event": "train_log", "step": 1032, "epoch": 0.43544303797468353, "progress_pct": 7.26, "epoch_pct": 7.26, "eta": "65:20:15", "max_grad_norm": 0.8, "loss": 0.8918828964233398, "grad_norm": 1.4841680526733398, "learning_rate": 9.059753954305799e-05} +{"ts": "2025-12-26T23:15:58", "event": "train_log", "step": 1034, "epoch": 0.43628691983122364, "progress_pct": 7.27, "epoch_pct": 7.27, "eta": "65:16:04", "max_grad_norm": 0.8, "loss": 0.835110068321228, "grad_norm": 1.428813099861145, "learning_rate": 9.077328646748682e-05} +{"ts": "2025-12-26T23:16:18", "event": "train_log", "step": 1036, "epoch": 0.4371308016877637, "progress_pct": 7.29, "epoch_pct": 7.29, "eta": "65:12:13", "max_grad_norm": 0.8, "loss": 0.746295690536499, "grad_norm": 1.559020757675171, "learning_rate": 9.094903339191566e-05} +{"ts": "2025-12-26T23:16:38", "event": "train_log", "step": 1038, "epoch": 0.4379746835443038, "progress_pct": 7.3, "epoch_pct": 7.3, "eta": "65:08:21", "max_grad_norm": 0.8, "loss": 0.8089123368263245, "grad_norm": 1.6996115446090698, "learning_rate": 9.112478031634448e-05} +{"ts": "2025-12-26T23:16:57", "event": "train_log", "step": 1040, "epoch": 0.4388185654008439, "progress_pct": 7.31, "epoch_pct": 7.31, "eta": "65:04:14", "max_grad_norm": 0.8, "loss": 0.8807073831558228, "grad_norm": 1.6615465879440308, "learning_rate": 9.13005272407733e-05} +{"ts": "2025-12-26T23:17:18", "event": "train_log", "step": 1042, "epoch": 0.43966244725738396, "progress_pct": 7.33, "epoch_pct": 7.33, "eta": "65:00:30", "max_grad_norm": 0.8, "loss": 0.7638427019119263, "grad_norm": 1.239142894744873, "learning_rate": 9.147627416520211e-05} +{"ts": "2025-12-26T23:17:39", "event": "train_log", "step": 1044, "epoch": 0.44050632911392407, "progress_pct": 7.34, "epoch_pct": 7.34, "eta": "64:56:59", "max_grad_norm": 0.8, "loss": 0.7817409634590149, "grad_norm": 1.1915178298950195, "learning_rate": 9.165202108963093e-05} +{"ts": "2025-12-26T23:18:00", "event": "train_log", "step": 1046, "epoch": 0.4413502109704641, "progress_pct": 7.36, "epoch_pct": 7.36, "eta": "64:53:15", "max_grad_norm": 0.8, "loss": 0.8586427569389343, "grad_norm": 1.6276934146881104, "learning_rate": 9.182776801405975e-05} +{"ts": "2025-12-26T23:18:21", "event": "train_log", "step": 1048, "epoch": 0.4421940928270042, "progress_pct": 7.37, "epoch_pct": 7.37, "eta": "64:49:41", "max_grad_norm": 0.8, "loss": 0.7481811046600342, "grad_norm": 1.480345606803894, "learning_rate": 9.200351493848857e-05} +{"ts": "2025-12-26T23:18:42", "event": "train_log", "step": 1050, "epoch": 0.4430379746835443, "progress_pct": 7.38, "epoch_pct": 7.38, "eta": "64:46:10", "max_grad_norm": 0.8, "loss": 0.8074686527252197, "grad_norm": 1.308419108390808, "learning_rate": 9.21792618629174e-05} +{"ts": "2025-12-26T23:19:02", "event": "train_log", "step": 1052, "epoch": 0.4438818565400844, "progress_pct": 7.4, "epoch_pct": 7.4, "eta": "64:42:14", "max_grad_norm": 0.8, "loss": 0.8455166816711426, "grad_norm": 1.6167182922363281, "learning_rate": 9.235500878734624e-05} +{"ts": "2025-12-26T23:19:22", "event": "train_log", "step": 1054, "epoch": 0.4447257383966245, "progress_pct": 7.41, "epoch_pct": 7.41, "eta": "64:38:25", "max_grad_norm": 0.8, "loss": 0.7255295515060425, "grad_norm": 1.6058826446533203, "learning_rate": 9.253075571177506e-05} +{"ts": "2025-12-26T23:19:41", "event": "train_log", "step": 1056, "epoch": 0.44556962025316454, "progress_pct": 7.43, "epoch_pct": 7.43, "eta": "64:34:36", "max_grad_norm": 0.8, "loss": 0.8329368233680725, "grad_norm": 1.6745728254318237, "learning_rate": 9.270650263620387e-05} +{"ts": "2025-12-26T23:20:00", "event": "train_log", "step": 1058, "epoch": 0.44641350210970465, "progress_pct": 7.44, "epoch_pct": 7.44, "eta": "64:30:35", "max_grad_norm": 0.8, "loss": 0.8583613634109497, "grad_norm": 1.5657380819320679, "learning_rate": 9.28822495606327e-05} +{"ts": "2025-12-26T23:20:19", "event": "train_log", "step": 1060, "epoch": 0.4472573839662447, "progress_pct": 7.45, "epoch_pct": 7.45, "eta": "64:26:39", "max_grad_norm": 0.8, "loss": 0.8546127080917358, "grad_norm": 1.5052601099014282, "learning_rate": 9.305799648506151e-05} +{"ts": "2025-12-26T23:20:38", "event": "train_log", "step": 1062, "epoch": 0.4481012658227848, "progress_pct": 7.47, "epoch_pct": 7.47, "eta": "64:22:41", "max_grad_norm": 0.8, "loss": 0.8416863679885864, "grad_norm": 1.510636806488037, "learning_rate": 9.323374340949033e-05} +{"ts": "2025-12-26T23:20:57", "event": "train_log", "step": 1064, "epoch": 0.4489451476793249, "progress_pct": 7.48, "epoch_pct": 7.48, "eta": "64:18:40", "max_grad_norm": 0.8, "loss": 0.830390453338623, "grad_norm": 1.4446617364883423, "learning_rate": 9.340949033391916e-05} +{"ts": "2025-12-26T23:21:15", "event": "train_log", "step": 1066, "epoch": 0.44978902953586497, "progress_pct": 7.5, "epoch_pct": 7.5, "eta": "64:14:36", "max_grad_norm": 0.8, "loss": 0.8000447154045105, "grad_norm": 1.6032582521438599, "learning_rate": 9.358523725834798e-05} +{"ts": "2025-12-26T23:21:35", "event": "train_log", "step": 1068, "epoch": 0.4506329113924051, "progress_pct": 7.51, "epoch_pct": 7.51, "eta": "64:10:54", "max_grad_norm": 0.8, "loss": 0.8310818672180176, "grad_norm": 1.5295692682266235, "learning_rate": 9.37609841827768e-05} +{"ts": "2025-12-26T23:21:54", "event": "train_log", "step": 1070, "epoch": 0.45147679324894513, "progress_pct": 7.52, "epoch_pct": 7.52, "eta": "64:06:59", "max_grad_norm": 0.8, "loss": 0.8377846479415894, "grad_norm": 1.3161942958831787, "learning_rate": 9.393673110720564e-05} +{"ts": "2025-12-26T23:22:13", "event": "train_log", "step": 1072, "epoch": 0.45232067510548524, "progress_pct": 7.54, "epoch_pct": 7.54, "eta": "64:03:07", "max_grad_norm": 0.8, "loss": 0.7852389216423035, "grad_norm": 1.4101601839065552, "learning_rate": 9.411247803163445e-05} +{"ts": "2025-12-26T23:22:32", "event": "train_log", "step": 1074, "epoch": 0.4531645569620253, "progress_pct": 7.55, "epoch_pct": 7.55, "eta": "63:59:17", "max_grad_norm": 0.8, "loss": 0.8763723969459534, "grad_norm": 1.4352775812149048, "learning_rate": 9.428822495606327e-05} +{"ts": "2025-12-26T23:22:51", "event": "train_log", "step": 1076, "epoch": 0.4540084388185654, "progress_pct": 7.57, "epoch_pct": 7.57, "eta": "63:55:22", "max_grad_norm": 0.8, "loss": 0.8177199363708496, "grad_norm": 1.4584673643112183, "learning_rate": 9.44639718804921e-05} +{"ts": "2025-12-26T23:23:10", "event": "train_log", "step": 1078, "epoch": 0.4548523206751055, "progress_pct": 7.58, "epoch_pct": 7.58, "eta": "63:51:28", "max_grad_norm": 0.8, "loss": 0.8333053588867188, "grad_norm": 1.6470575332641602, "learning_rate": 9.463971880492091e-05} +{"ts": "2025-12-26T23:23:28", "event": "train_log", "step": 1080, "epoch": 0.45569620253164556, "progress_pct": 7.59, "epoch_pct": 7.59, "eta": "63:47:37", "max_grad_norm": 0.8, "loss": 0.8546649217605591, "grad_norm": 1.4429512023925781, "learning_rate": 9.481546572934975e-05} +{"ts": "2025-12-26T23:23:47", "event": "train_log", "step": 1082, "epoch": 0.45654008438818566, "progress_pct": 7.61, "epoch_pct": 7.61, "eta": "63:43:40", "max_grad_norm": 0.8, "loss": 0.838036298751831, "grad_norm": 1.4885371923446655, "learning_rate": 9.499121265377856e-05} +{"ts": "2025-12-26T23:24:06", "event": "train_log", "step": 1084, "epoch": 0.4573839662447257, "progress_pct": 7.62, "epoch_pct": 7.62, "eta": "63:39:57", "max_grad_norm": 0.8, "loss": 0.7295010089874268, "grad_norm": 1.4601678848266602, "learning_rate": 9.516695957820738e-05} +{"ts": "2025-12-26T23:24:26", "event": "train_log", "step": 1086, "epoch": 0.4582278481012658, "progress_pct": 7.64, "epoch_pct": 7.64, "eta": "63:36:21", "max_grad_norm": 0.8, "loss": 0.6990782618522644, "grad_norm": 1.2399365901947021, "learning_rate": 9.53427065026362e-05} +{"ts": "2025-12-26T23:24:47", "event": "train_log", "step": 1088, "epoch": 0.45907172995780593, "progress_pct": 7.65, "epoch_pct": 7.65, "eta": "63:32:59", "max_grad_norm": 0.8, "loss": 0.7790928483009338, "grad_norm": 1.2936921119689941, "learning_rate": 9.551845342706504e-05} +{"ts": "2025-12-26T23:25:06", "event": "train_log", "step": 1090, "epoch": 0.459915611814346, "progress_pct": 7.67, "epoch_pct": 7.67, "eta": "63:29:09", "max_grad_norm": 0.8, "loss": 0.8061056733131409, "grad_norm": 1.3408331871032715, "learning_rate": 9.569420035149385e-05} +{"ts": "2025-12-26T23:25:24", "event": "train_log", "step": 1092, "epoch": 0.4607594936708861, "progress_pct": 7.68, "epoch_pct": 7.68, "eta": "63:25:21", "max_grad_norm": 0.8, "loss": 0.856796383857727, "grad_norm": 1.5525178909301758, "learning_rate": 9.586994727592267e-05} +{"ts": "2025-12-26T23:25:44", "event": "train_log", "step": 1094, "epoch": 0.46160337552742614, "progress_pct": 7.69, "epoch_pct": 7.69, "eta": "63:21:42", "max_grad_norm": 0.8, "loss": 0.7626663446426392, "grad_norm": 1.2944618463516235, "learning_rate": 9.604569420035149e-05} +{"ts": "2025-12-26T23:26:03", "event": "train_log", "step": 1096, "epoch": 0.46244725738396625, "progress_pct": 7.71, "epoch_pct": 7.71, "eta": "63:18:00", "max_grad_norm": 0.8, "loss": 0.7524681091308594, "grad_norm": 1.412204623222351, "learning_rate": 9.622144112478033e-05} +{"ts": "2025-12-26T23:26:22", "event": "train_log", "step": 1098, "epoch": 0.46329113924050636, "progress_pct": 7.72, "epoch_pct": 7.72, "eta": "63:14:16", "max_grad_norm": 0.8, "loss": 0.8430375456809998, "grad_norm": 1.4851596355438232, "learning_rate": 9.639718804920914e-05} +{"ts": "2025-12-26T23:26:40", "event": "train_log", "step": 1100, "epoch": 0.4641350210970464, "progress_pct": 7.74, "epoch_pct": 7.74, "eta": "63:10:18", "max_grad_norm": 0.8, "loss": 0.8374918103218079, "grad_norm": 1.831943154335022, "learning_rate": 9.657293497363796e-05} +{"ts": "2025-12-26T23:41:01", "event": "train_log", "step": 1100, "epoch": 0.4641350210970464, "progress_pct": 7.74, "epoch_pct": 7.74, "eta": "66:01:28", "max_grad_norm": 0.8, "eval_loss": 0.8283821940422058, "eval_runtime": 861.0464, "eval_samples_per_second": 2.447, "eval_steps_per_second": 2.447} +{"ts": "2025-12-26T23:41:20", "event": "train_log", "step": 1102, "epoch": 0.4649789029535865, "progress_pct": 7.75, "epoch_pct": 7.75, "eta": "65:57:30", "max_grad_norm": 0.8, "loss": 0.8063139915466309, "grad_norm": 1.4989945888519287, "learning_rate": 9.674868189806678e-05} +{"ts": "2025-12-26T23:41:40", "event": "train_log", "step": 1104, "epoch": 0.46582278481012657, "progress_pct": 7.76, "epoch_pct": 7.76, "eta": "65:53:43", "max_grad_norm": 0.8, "loss": 0.8109207153320312, "grad_norm": 1.3772722482681274, "learning_rate": 9.692442882249562e-05} +{"ts": "2025-12-26T23:42:00", "event": "train_log", "step": 1106, "epoch": 0.4666666666666667, "progress_pct": 7.78, "epoch_pct": 7.78, "eta": "65:49:58", "max_grad_norm": 0.8, "loss": 0.8667853474617004, "grad_norm": 1.4963124990463257, "learning_rate": 9.710017574692443e-05} +{"ts": "2025-12-26T23:42:22", "event": "train_log", "step": 1108, "epoch": 0.4675105485232067, "progress_pct": 7.79, "epoch_pct": 7.79, "eta": "65:46:33", "max_grad_norm": 0.8, "loss": 0.8020523190498352, "grad_norm": 1.4250836372375488, "learning_rate": 9.727592267135325e-05} +{"ts": "2025-12-26T23:42:42", "event": "train_log", "step": 1110, "epoch": 0.46835443037974683, "progress_pct": 7.81, "epoch_pct": 7.81, "eta": "65:42:48", "max_grad_norm": 0.8, "loss": 0.8271048069000244, "grad_norm": 1.475599765777588, "learning_rate": 9.745166959578209e-05} +{"ts": "2025-12-26T23:43:02", "event": "train_log", "step": 1112, "epoch": 0.46919831223628694, "progress_pct": 7.82, "epoch_pct": 7.82, "eta": "65:38:57", "max_grad_norm": 0.8, "loss": 0.7615619897842407, "grad_norm": 1.3727436065673828, "learning_rate": 9.76274165202109e-05} +{"ts": "2025-12-26T23:43:22", "event": "train_log", "step": 1114, "epoch": 0.470042194092827, "progress_pct": 7.83, "epoch_pct": 7.83, "eta": "65:35:14", "max_grad_norm": 0.8, "loss": 0.7843242883682251, "grad_norm": 1.2233914136886597, "learning_rate": 9.780316344463972e-05} +{"ts": "2025-12-26T23:43:41", "event": "train_log", "step": 1116, "epoch": 0.4708860759493671, "progress_pct": 7.85, "epoch_pct": 7.85, "eta": "65:31:20", "max_grad_norm": 0.8, "loss": 0.834839940071106, "grad_norm": 1.5734832286834717, "learning_rate": 9.797891036906854e-05} +{"ts": "2025-12-26T23:44:01", "event": "train_log", "step": 1118, "epoch": 0.47172995780590715, "progress_pct": 7.86, "epoch_pct": 7.86, "eta": "65:27:32", "max_grad_norm": 0.8, "loss": 0.7584373950958252, "grad_norm": 1.3778531551361084, "learning_rate": 9.815465729349736e-05} +{"ts": "2025-12-26T23:44:20", "event": "train_log", "step": 1120, "epoch": 0.47257383966244726, "progress_pct": 7.88, "epoch_pct": 7.88, "eta": "65:23:37", "max_grad_norm": 0.8, "loss": 0.8204697370529175, "grad_norm": 1.5535035133361816, "learning_rate": 9.833040421792618e-05} +{"ts": "2025-12-26T23:44:39", "event": "train_log", "step": 1122, "epoch": 0.47341772151898737, "progress_pct": 7.89, "epoch_pct": 7.89, "eta": "65:19:41", "max_grad_norm": 0.8, "loss": 0.9012852311134338, "grad_norm": 1.4743636846542358, "learning_rate": 9.850615114235501e-05} +{"ts": "2025-12-26T23:44:58", "event": "train_log", "step": 1124, "epoch": 0.4742616033755274, "progress_pct": 7.9, "epoch_pct": 7.9, "eta": "65:15:57", "max_grad_norm": 0.8, "loss": 0.8392805457115173, "grad_norm": 1.4134864807128906, "learning_rate": 9.868189806678383e-05} +{"ts": "2025-12-26T23:45:19", "event": "train_log", "step": 1126, "epoch": 0.4751054852320675, "progress_pct": 7.92, "epoch_pct": 7.92, "eta": "65:12:21", "max_grad_norm": 0.8, "loss": 0.7135441303253174, "grad_norm": 1.3308019638061523, "learning_rate": 9.885764499121267e-05} +{"ts": "2025-12-26T23:45:38", "event": "train_log", "step": 1128, "epoch": 0.4759493670886076, "progress_pct": 7.93, "epoch_pct": 7.93, "eta": "65:08:34", "max_grad_norm": 0.8, "loss": 0.8464727401733398, "grad_norm": 1.5354844331741333, "learning_rate": 9.903339191564149e-05} +{"ts": "2025-12-26T23:45:58", "event": "train_log", "step": 1130, "epoch": 0.4767932489451477, "progress_pct": 7.95, "epoch_pct": 7.95, "eta": "65:04:54", "max_grad_norm": 0.8, "loss": 0.7691597938537598, "grad_norm": 1.2730523347854614, "learning_rate": 9.92091388400703e-05} +{"ts": "2025-12-26T23:46:17", "event": "train_log", "step": 1132, "epoch": 0.47763713080168774, "progress_pct": 7.96, "epoch_pct": 7.96, "eta": "65:00:59", "max_grad_norm": 0.8, "loss": 0.8068788647651672, "grad_norm": 1.5459758043289185, "learning_rate": 9.938488576449912e-05} +{"ts": "2025-12-26T23:46:36", "event": "train_log", "step": 1134, "epoch": 0.47848101265822784, "progress_pct": 7.97, "epoch_pct": 7.97, "eta": "64:57:13", "max_grad_norm": 0.8, "loss": 0.8091006278991699, "grad_norm": 1.345678687095642, "learning_rate": 9.956063268892794e-05} +{"ts": "2025-12-26T23:46:55", "event": "train_log", "step": 1136, "epoch": 0.47932489451476795, "progress_pct": 7.99, "epoch_pct": 7.99, "eta": "64:53:30", "max_grad_norm": 0.8, "loss": 0.735533595085144, "grad_norm": 1.317076563835144, "learning_rate": 9.973637961335676e-05} +{"ts": "2025-12-26T23:47:14", "event": "train_log", "step": 1138, "epoch": 0.480168776371308, "progress_pct": 8.0, "epoch_pct": 8.0, "eta": "64:49:39", "max_grad_norm": 0.8, "loss": 0.7935182452201843, "grad_norm": 1.5011168718338013, "learning_rate": 9.99121265377856e-05} +{"ts": "2025-12-26T23:47:32", "event": "train_log", "step": 1140, "epoch": 0.4810126582278481, "progress_pct": 8.02, "epoch_pct": 8.02, "eta": "64:45:38", "max_grad_norm": 0.8, "loss": 0.8203520774841309, "grad_norm": 1.673899531364441, "learning_rate": 9.999999855824502e-05} +{"ts": "2025-12-26T23:47:51", "event": "train_log", "step": 1142, "epoch": 0.48185654008438816, "progress_pct": 8.03, "epoch_pct": 8.03, "eta": "64:41:57", "max_grad_norm": 0.8, "loss": 0.7233241200447083, "grad_norm": 1.344337821006775, "learning_rate": 9.999998702420562e-05} +{"ts": "2025-12-26T23:48:09", "event": "train_log", "step": 1144, "epoch": 0.48270042194092827, "progress_pct": 8.05, "epoch_pct": 8.05, "eta": "64:37:53", "max_grad_norm": 0.8, "loss": 0.8795552849769592, "grad_norm": 1.5819076299667358, "learning_rate": 9.999996395612948e-05} +{"ts": "2025-12-26T23:48:27", "event": "train_log", "step": 1146, "epoch": 0.4835443037974684, "progress_pct": 8.06, "epoch_pct": 8.06, "eta": "64:34:00", "max_grad_norm": 0.8, "loss": 0.8482733964920044, "grad_norm": 1.7427241802215576, "learning_rate": 9.999992935402192e-05} +{"ts": "2025-12-26T23:48:46", "event": "train_log", "step": 1148, "epoch": 0.48438818565400843, "progress_pct": 8.07, "epoch_pct": 8.07, "eta": "64:30:21", "max_grad_norm": 0.8, "loss": 0.7905706167221069, "grad_norm": 1.2877503633499146, "learning_rate": 9.999988321789093e-05} +{"ts": "2025-12-26T23:49:05", "event": "train_log", "step": 1150, "epoch": 0.48523206751054854, "progress_pct": 8.09, "epoch_pct": 8.09, "eta": "64:26:27", "max_grad_norm": 0.8, "loss": 0.8609708547592163, "grad_norm": 1.4887222051620483, "learning_rate": 9.999982554774715e-05} +{"ts": "2025-12-26T23:49:24", "event": "train_log", "step": 1152, "epoch": 0.4860759493670886, "progress_pct": 8.1, "epoch_pct": 8.1, "eta": "64:22:52", "max_grad_norm": 0.8, "loss": 0.7890065908432007, "grad_norm": 1.3625136613845825, "learning_rate": 9.999975634360388e-05} +{"ts": "2025-12-26T23:49:43", "event": "train_log", "step": 1154, "epoch": 0.4869198312236287, "progress_pct": 8.12, "epoch_pct": 8.12, "eta": "64:19:10", "max_grad_norm": 0.8, "loss": 0.7908958196640015, "grad_norm": 1.3631492853164673, "learning_rate": 9.999967560547708e-05} +{"ts": "2025-12-26T23:50:02", "event": "train_log", "step": 1156, "epoch": 0.4877637130801688, "progress_pct": 8.13, "epoch_pct": 8.13, "eta": "64:15:22", "max_grad_norm": 0.8, "loss": 0.8509655594825745, "grad_norm": 1.5244156122207642, "learning_rate": 9.99995833333854e-05} +{"ts": "2025-12-26T23:50:21", "event": "train_log", "step": 1158, "epoch": 0.48860759493670886, "progress_pct": 8.14, "epoch_pct": 8.14, "eta": "64:11:45", "max_grad_norm": 0.8, "loss": 0.7329106330871582, "grad_norm": 1.2513200044631958, "learning_rate": 9.999947952735007e-05} +{"ts": "2025-12-26T23:50:41", "event": "train_log", "step": 1160, "epoch": 0.48945147679324896, "progress_pct": 8.16, "epoch_pct": 8.16, "eta": "64:08:19", "max_grad_norm": 0.8, "loss": 0.7237489223480225, "grad_norm": 1.1539413928985596, "learning_rate": 9.99993641873951e-05} +{"ts": "2025-12-26T23:51:00", "event": "train_log", "step": 1162, "epoch": 0.490295358649789, "progress_pct": 8.17, "epoch_pct": 8.17, "eta": "64:04:33", "max_grad_norm": 0.8, "loss": 0.8650591373443604, "grad_norm": 1.3859314918518066, "learning_rate": 9.999923731354706e-05} +{"ts": "2025-12-26T23:51:20", "event": "train_log", "step": 1164, "epoch": 0.4911392405063291, "progress_pct": 8.19, "epoch_pct": 8.19, "eta": "64:01:10", "max_grad_norm": 0.8, "loss": 0.7516807913780212, "grad_norm": 1.2910805940628052, "learning_rate": 9.999909890583521e-05} +{"ts": "2025-12-26T23:51:41", "event": "train_log", "step": 1166, "epoch": 0.4919831223628692, "progress_pct": 8.2, "epoch_pct": 8.2, "eta": "63:57:54", "max_grad_norm": 0.8, "loss": 0.7082475423812866, "grad_norm": 1.6100077629089355, "learning_rate": 9.999894896429152e-05} +{"ts": "2025-12-26T23:52:02", "event": "train_log", "step": 1168, "epoch": 0.4928270042194093, "progress_pct": 8.21, "epoch_pct": 8.21, "eta": "63:54:39", "max_grad_norm": 0.8, "loss": 0.8403750658035278, "grad_norm": 1.2313556671142578, "learning_rate": 9.999878748895053e-05} +{"ts": "2025-12-26T23:52:20", "event": "train_log", "step": 1170, "epoch": 0.4936708860759494, "progress_pct": 8.23, "epoch_pct": 8.23, "eta": "63:50:56", "max_grad_norm": 0.8, "loss": 0.8083041906356812, "grad_norm": 1.3402830362319946, "learning_rate": 9.999861447984952e-05} +{"ts": "2025-12-26T23:52:39", "event": "train_log", "step": 1172, "epoch": 0.49451476793248944, "progress_pct": 8.24, "epoch_pct": 8.24, "eta": "63:47:14", "max_grad_norm": 0.8, "loss": 0.8339354991912842, "grad_norm": 1.516775131225586, "learning_rate": 9.999842993702839e-05} +{"ts": "2025-12-26T23:52:58", "event": "train_log", "step": 1174, "epoch": 0.49535864978902955, "progress_pct": 8.26, "epoch_pct": 8.26, "eta": "63:43:42", "max_grad_norm": 0.8, "loss": 0.7708724141120911, "grad_norm": 1.2698423862457275, "learning_rate": 9.999823386052971e-05} +{"ts": "2025-12-26T23:53:18", "event": "train_log", "step": 1176, "epoch": 0.4962025316455696, "progress_pct": 8.27, "epoch_pct": 8.27, "eta": "63:40:23", "max_grad_norm": 0.8, "loss": 0.7589715719223022, "grad_norm": 1.339390516281128, "learning_rate": 9.999802625039872e-05} +{"ts": "2025-12-26T23:53:37", "event": "train_log", "step": 1178, "epoch": 0.4970464135021097, "progress_pct": 8.28, "epoch_pct": 8.28, "eta": "63:36:50", "max_grad_norm": 0.8, "loss": 0.8523206114768982, "grad_norm": 1.4618452787399292, "learning_rate": 9.99978071066833e-05} +{"ts": "2025-12-26T23:53:57", "event": "train_log", "step": 1180, "epoch": 0.4978902953586498, "progress_pct": 8.3, "epoch_pct": 8.3, "eta": "63:33:22", "max_grad_norm": 0.8, "loss": 0.8143196105957031, "grad_norm": 1.4812564849853516, "learning_rate": 9.9997576429434e-05} +{"ts": "2025-12-26T23:54:15", "event": "train_log", "step": 1182, "epoch": 0.49873417721518987, "progress_pct": 8.31, "epoch_pct": 8.31, "eta": "63:29:39", "max_grad_norm": 0.8, "loss": 0.800125002861023, "grad_norm": 1.5720716714859009, "learning_rate": 9.999733421870405e-05} +{"ts": "2025-12-26T23:54:33", "event": "train_log", "step": 1184, "epoch": 0.49957805907173, "progress_pct": 8.33, "epoch_pct": 8.33, "eta": "63:25:56", "max_grad_norm": 0.8, "loss": 0.7618259191513062, "grad_norm": 1.4421230554580688, "learning_rate": 9.99970804745493e-05} +{"ts": "2025-12-26T23:54:53", "event": "train_log", "step": 1186, "epoch": 0.5004219409282701, "progress_pct": 8.34, "epoch_pct": 8.34, "eta": "63:22:33", "max_grad_norm": 0.8, "loss": 0.7162163853645325, "grad_norm": 1.5794934034347534, "learning_rate": 9.99968151970283e-05} +{"ts": "2025-12-26T23:55:12", "event": "train_log", "step": 1188, "epoch": 0.5012658227848101, "progress_pct": 8.35, "epoch_pct": 8.35, "eta": "63:19:08", "max_grad_norm": 0.8, "loss": 0.8089820146560669, "grad_norm": 1.8590432405471802, "learning_rate": 9.999653838620225e-05} +{"ts": "2025-12-26T23:55:32", "event": "train_log", "step": 1190, "epoch": 0.5021097046413502, "progress_pct": 8.37, "epoch_pct": 8.37, "eta": "63:15:42", "max_grad_norm": 0.8, "loss": 0.8011203408241272, "grad_norm": 1.5194507837295532, "learning_rate": 9.999625004213498e-05} +{"ts": "2025-12-26T23:55:51", "event": "train_log", "step": 1192, "epoch": 0.5029535864978903, "progress_pct": 8.38, "epoch_pct": 8.38, "eta": "63:12:14", "max_grad_norm": 0.8, "loss": 0.761158287525177, "grad_norm": 1.6986470222473145, "learning_rate": 9.999595016489303e-05} +{"ts": "2025-12-26T23:56:11", "event": "train_log", "step": 1194, "epoch": 0.5037974683544304, "progress_pct": 8.4, "epoch_pct": 8.4, "eta": "63:09:01", "max_grad_norm": 0.8, "loss": 0.7898027300834656, "grad_norm": 1.4413946866989136, "learning_rate": 9.999563875454559e-05} +{"ts": "2025-12-26T23:56:31", "event": "train_log", "step": 1196, "epoch": 0.5046413502109705, "progress_pct": 8.41, "epoch_pct": 8.41, "eta": "63:05:43", "max_grad_norm": 0.8, "loss": 0.8018442392349243, "grad_norm": 1.4509994983673096, "learning_rate": 9.999531581116443e-05} +{"ts": "2025-12-26T23:56:51", "event": "train_log", "step": 1198, "epoch": 0.5054852320675105, "progress_pct": 8.42, "epoch_pct": 8.42, "eta": "63:02:20", "max_grad_norm": 0.8, "loss": 0.7804076075553894, "grad_norm": 1.400659441947937, "learning_rate": 9.999498133482412e-05} +{"ts": "2025-12-26T23:57:09", "event": "train_log", "step": 1200, "epoch": 0.5063291139240507, "progress_pct": 8.44, "epoch_pct": 8.44, "eta": "62:58:51", "max_grad_norm": 0.8, "loss": 0.82496178150177, "grad_norm": 1.486840009689331, "learning_rate": 9.999463532560178e-05} +{"ts": "2025-12-27T00:11:32", "event": "train_log", "step": 1200, "epoch": 0.5063291139240507, "progress_pct": 8.44, "epoch_pct": 8.44, "eta": "65:34:46", "max_grad_norm": 0.8, "eval_loss": 0.8186545968055725, "eval_runtime": 862.1638, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444} +{"ts": "2025-12-27T00:11:51", "event": "train_log", "step": 1202, "epoch": 0.5071729957805907, "progress_pct": 8.45, "epoch_pct": 8.45, "eta": "65:31:06", "max_grad_norm": 0.8, "loss": 0.8037722706794739, "grad_norm": 1.2770357131958008, "learning_rate": 9.999427778357723e-05} +{"ts": "2025-12-27T00:12:11", "event": "train_log", "step": 1204, "epoch": 0.5080168776371308, "progress_pct": 8.47, "epoch_pct": 8.47, "eta": "65:27:31", "max_grad_norm": 0.8, "loss": 0.7329373359680176, "grad_norm": 1.4540977478027344, "learning_rate": 9.999390870883297e-05} +{"ts": "2025-12-27T00:12:29", "event": "train_log", "step": 1206, "epoch": 0.5088607594936709, "progress_pct": 8.48, "epoch_pct": 8.48, "eta": "65:23:39", "max_grad_norm": 0.8, "loss": 0.8224589824676514, "grad_norm": 1.4469913244247437, "learning_rate": 9.999352810145412e-05} +{"ts": "2025-12-27T00:12:48", "event": "train_log", "step": 1208, "epoch": 0.509704641350211, "progress_pct": 8.5, "epoch_pct": 8.5, "eta": "65:19:58", "max_grad_norm": 0.8, "loss": 0.8106292486190796, "grad_norm": 1.46500563621521, "learning_rate": 9.999313596152847e-05} +{"ts": "2025-12-27T00:13:08", "event": "train_log", "step": 1210, "epoch": 0.510548523206751, "progress_pct": 8.51, "epoch_pct": 8.51, "eta": "65:16:27", "max_grad_norm": 0.8, "loss": 0.747698187828064, "grad_norm": 1.3526637554168701, "learning_rate": 9.999273228914649e-05} +{"ts": "2025-12-27T00:13:26", "event": "train_log", "step": 1212, "epoch": 0.5113924050632911, "progress_pct": 8.52, "epoch_pct": 8.52, "eta": "65:12:46", "max_grad_norm": 0.8, "loss": 0.7612425684928894, "grad_norm": 1.28840172290802, "learning_rate": 9.999231708440131e-05} +{"ts": "2025-12-27T00:13:49", "event": "train_log", "step": 1214, "epoch": 0.5122362869198313, "progress_pct": 8.54, "epoch_pct": 8.54, "eta": "65:09:40", "max_grad_norm": 0.8, "loss": 0.6839463710784912, "grad_norm": 1.0283230543136597, "learning_rate": 9.99918903473887e-05} +{"ts": "2025-12-27T00:14:06", "event": "train_log", "step": 1216, "epoch": 0.5130801687763713, "progress_pct": 8.55, "epoch_pct": 8.55, "eta": "65:05:49", "max_grad_norm": 0.8, "loss": 0.8539203405380249, "grad_norm": 1.5231431722640991, "learning_rate": 9.999145207820708e-05} +{"ts": "2025-12-27T00:14:25", "event": "train_log", "step": 1218, "epoch": 0.5139240506329114, "progress_pct": 8.57, "epoch_pct": 8.57, "eta": "65:02:10", "max_grad_norm": 0.8, "loss": 0.7960102558135986, "grad_norm": 1.3289231061935425, "learning_rate": 9.999100227695758e-05} +{"ts": "2025-12-27T00:14:44", "event": "train_log", "step": 1220, "epoch": 0.5147679324894515, "progress_pct": 8.58, "epoch_pct": 8.58, "eta": "64:58:28", "max_grad_norm": 0.8, "loss": 0.7639255523681641, "grad_norm": 1.3770930767059326, "learning_rate": 9.999054094374396e-05} +{"ts": "2025-12-27T00:15:02", "event": "train_log", "step": 1222, "epoch": 0.5156118143459916, "progress_pct": 8.59, "epoch_pct": 8.59, "eta": "64:54:44", "max_grad_norm": 0.8, "loss": 0.7743061780929565, "grad_norm": 1.3028030395507812, "learning_rate": 9.999006807867262e-05} +{"ts": "2025-12-27T00:15:23", "event": "train_log", "step": 1224, "epoch": 0.5164556962025316, "progress_pct": 8.61, "epoch_pct": 8.61, "eta": "64:51:27", "max_grad_norm": 0.8, "loss": 0.7922407984733582, "grad_norm": 1.1827034950256348, "learning_rate": 9.998958368185265e-05} +{"ts": "2025-12-27T00:15:42", "event": "train_log", "step": 1226, "epoch": 0.5172995780590718, "progress_pct": 8.62, "epoch_pct": 8.62, "eta": "64:47:53", "max_grad_norm": 0.8, "loss": 0.7671286463737488, "grad_norm": 1.2973705530166626, "learning_rate": 9.99890877533958e-05} +{"ts": "2025-12-27T00:16:02", "event": "train_log", "step": 1228, "epoch": 0.5181434599156118, "progress_pct": 8.64, "epoch_pct": 8.64, "eta": "64:44:29", "max_grad_norm": 0.8, "loss": 0.7546951174736023, "grad_norm": 1.5820153951644897, "learning_rate": 9.998858029341646e-05} +{"ts": "2025-12-27T00:16:20", "event": "train_log", "step": 1230, "epoch": 0.5189873417721519, "progress_pct": 8.65, "epoch_pct": 8.65, "eta": "64:40:42", "max_grad_norm": 0.8, "loss": 0.8734183311462402, "grad_norm": 1.6140317916870117, "learning_rate": 9.99880613020317e-05} +{"ts": "2025-12-27T00:16:39", "event": "train_log", "step": 1232, "epoch": 0.5198312236286919, "progress_pct": 8.66, "epoch_pct": 8.66, "eta": "64:37:08", "max_grad_norm": 0.8, "loss": 0.8410643339157104, "grad_norm": 1.1190184354782104, "learning_rate": 9.998753077936122e-05} +{"ts": "2025-12-27T00:16:58", "event": "train_log", "step": 1234, "epoch": 0.5206751054852321, "progress_pct": 8.68, "epoch_pct": 8.68, "eta": "64:33:39", "max_grad_norm": 0.8, "loss": 0.7769841551780701, "grad_norm": 1.3876196146011353, "learning_rate": 9.998698872552744e-05} +{"ts": "2025-12-27T00:17:17", "event": "train_log", "step": 1236, "epoch": 0.5215189873417722, "progress_pct": 8.69, "epoch_pct": 8.69, "eta": "64:30:07", "max_grad_norm": 0.8, "loss": 0.8846109509468079, "grad_norm": 1.699522852897644, "learning_rate": 9.998643514065535e-05} +{"ts": "2025-12-27T00:17:37", "event": "train_log", "step": 1238, "epoch": 0.5223628691983122, "progress_pct": 8.71, "epoch_pct": 8.71, "eta": "64:26:39", "max_grad_norm": 0.8, "loss": 0.7664945125579834, "grad_norm": 1.3805134296417236, "learning_rate": 9.998587002487271e-05} +{"ts": "2025-12-27T00:17:56", "event": "train_log", "step": 1240, "epoch": 0.5232067510548524, "progress_pct": 8.72, "epoch_pct": 8.72, "eta": "64:23:10", "max_grad_norm": 0.8, "loss": 0.7243514060974121, "grad_norm": 1.3679476976394653, "learning_rate": 9.998529337830984e-05} +{"ts": "2025-12-27T00:18:15", "event": "train_log", "step": 1242, "epoch": 0.5240506329113924, "progress_pct": 8.73, "epoch_pct": 8.73, "eta": "64:19:42", "max_grad_norm": 0.8, "loss": 0.8061941862106323, "grad_norm": 1.399200677871704, "learning_rate": 9.998470520109977e-05} +{"ts": "2025-12-27T00:18:37", "event": "train_log", "step": 1244, "epoch": 0.5248945147679325, "progress_pct": 8.75, "epoch_pct": 8.75, "eta": "64:16:39", "max_grad_norm": 0.8, "loss": 0.7741840481758118, "grad_norm": 1.3441044092178345, "learning_rate": 9.99841054933782e-05} +{"ts": "2025-12-27T00:18:55", "event": "train_log", "step": 1246, "epoch": 0.5257383966244725, "progress_pct": 8.76, "epoch_pct": 8.76, "eta": "64:13:08", "max_grad_norm": 0.8, "loss": 0.7619491815567017, "grad_norm": 1.3375325202941895, "learning_rate": 9.998349425528344e-05} +{"ts": "2025-12-27T00:19:14", "event": "train_log", "step": 1248, "epoch": 0.5265822784810127, "progress_pct": 8.78, "epoch_pct": 8.78, "eta": "64:09:36", "max_grad_norm": 0.8, "loss": 0.8315094113349915, "grad_norm": 1.5517847537994385, "learning_rate": 9.998287148695651e-05} +{"ts": "2025-12-27T00:19:34", "event": "train_log", "step": 1250, "epoch": 0.5274261603375527, "progress_pct": 8.79, "epoch_pct": 8.79, "eta": "64:06:19", "max_grad_norm": 0.8, "loss": 0.7536082863807678, "grad_norm": 1.244997501373291, "learning_rate": 9.998223718854107e-05} +{"ts": "2025-12-27T00:19:53", "event": "train_log", "step": 1252, "epoch": 0.5282700421940928, "progress_pct": 8.8, "epoch_pct": 8.8, "eta": "64:02:49", "max_grad_norm": 0.8, "loss": 0.826419472694397, "grad_norm": 1.3190033435821533, "learning_rate": 9.998159136018344e-05} +{"ts": "2025-12-27T00:20:12", "event": "train_log", "step": 1254, "epoch": 0.529113924050633, "progress_pct": 8.82, "epoch_pct": 8.82, "eta": "63:59:27", "max_grad_norm": 0.8, "loss": 0.7866435647010803, "grad_norm": 1.2750061750411987, "learning_rate": 9.998093400203259e-05} +{"ts": "2025-12-27T00:20:32", "event": "train_log", "step": 1256, "epoch": 0.529957805907173, "progress_pct": 8.83, "epoch_pct": 8.83, "eta": "63:56:11", "max_grad_norm": 0.8, "loss": 0.7796626687049866, "grad_norm": 1.422908067703247, "learning_rate": 9.998026511424017e-05} +{"ts": "2025-12-27T00:20:52", "event": "train_log", "step": 1258, "epoch": 0.5308016877637131, "progress_pct": 8.85, "epoch_pct": 8.85, "eta": "63:52:51", "max_grad_norm": 0.8, "loss": 0.815027117729187, "grad_norm": 1.435552954673767, "learning_rate": 9.997958469696048e-05} +{"ts": "2025-12-27T00:21:14", "event": "train_log", "step": 1260, "epoch": 0.5316455696202531, "progress_pct": 8.86, "epoch_pct": 8.86, "eta": "63:50:02", "max_grad_norm": 0.8, "loss": 0.6925795674324036, "grad_norm": 1.1950994729995728, "learning_rate": 9.997889275035049e-05} +{"ts": "2025-12-27T00:21:35", "event": "train_log", "step": 1262, "epoch": 0.5324894514767933, "progress_pct": 8.87, "epoch_pct": 8.87, "eta": "63:46:57", "max_grad_norm": 0.8, "loss": 0.822464108467102, "grad_norm": 1.3049622774124146, "learning_rate": 9.997818927456978e-05} +{"ts": "2025-12-27T00:21:57", "event": "train_log", "step": 1264, "epoch": 0.5333333333333333, "progress_pct": 8.89, "epoch_pct": 8.89, "eta": "63:43:56", "max_grad_norm": 0.8, "loss": 0.7955381274223328, "grad_norm": 1.2197340726852417, "learning_rate": 9.997747426978066e-05} +{"ts": "2025-12-27T00:22:17", "event": "train_log", "step": 1266, "epoch": 0.5341772151898734, "progress_pct": 8.9, "epoch_pct": 8.9, "eta": "63:40:49", "max_grad_norm": 0.8, "loss": 0.8642181754112244, "grad_norm": 1.2463661432266235, "learning_rate": 9.997674773614807e-05} +{"ts": "2025-12-27T00:22:36", "event": "train_log", "step": 1268, "epoch": 0.5350210970464135, "progress_pct": 8.92, "epoch_pct": 8.92, "eta": "63:37:27", "max_grad_norm": 0.8, "loss": 0.8776891827583313, "grad_norm": 1.421393871307373, "learning_rate": 9.99760096738396e-05} +{"ts": "2025-12-27T00:22:56", "event": "train_log", "step": 1270, "epoch": 0.5358649789029536, "progress_pct": 8.93, "epoch_pct": 8.93, "eta": "63:34:17", "max_grad_norm": 0.8, "loss": 0.7446491122245789, "grad_norm": 1.4347561597824097, "learning_rate": 9.997526008302549e-05} +{"ts": "2025-12-27T00:23:17", "event": "train_log", "step": 1272, "epoch": 0.5367088607594936, "progress_pct": 8.95, "epoch_pct": 8.95, "eta": "63:31:06", "max_grad_norm": 0.8, "loss": 0.8581281304359436, "grad_norm": 1.2056710720062256, "learning_rate": 9.99744989638787e-05} +{"ts": "2025-12-27T00:23:36", "event": "train_log", "step": 1274, "epoch": 0.5375527426160338, "progress_pct": 8.96, "epoch_pct": 8.96, "eta": "63:27:47", "max_grad_norm": 0.8, "loss": 0.7386330366134644, "grad_norm": 1.1672608852386475, "learning_rate": 9.997372631657475e-05} +{"ts": "2025-12-27T00:23:54", "event": "train_log", "step": 1276, "epoch": 0.5383966244725739, "progress_pct": 8.97, "epoch_pct": 8.97, "eta": "63:24:22", "max_grad_norm": 0.8, "loss": 0.7806804776191711, "grad_norm": 1.4313966035842896, "learning_rate": 9.997294214129191e-05} +{"ts": "2025-12-27T00:24:16", "event": "train_log", "step": 1278, "epoch": 0.5392405063291139, "progress_pct": 8.99, "epoch_pct": 8.99, "eta": "63:21:32", "max_grad_norm": 0.8, "loss": 0.6830351948738098, "grad_norm": 1.1666971445083618, "learning_rate": 9.997214643821107e-05} +{"ts": "2025-12-27T00:24:37", "event": "train_log", "step": 1280, "epoch": 0.540084388185654, "progress_pct": 9.0, "epoch_pct": 9.0, "eta": "63:18:29", "max_grad_norm": 0.8, "loss": 0.8570694327354431, "grad_norm": 1.491783857345581, "learning_rate": 9.997133920751578e-05} +{"ts": "2025-12-27T00:24:59", "event": "train_log", "step": 1282, "epoch": 0.5409282700421941, "progress_pct": 9.02, "epoch_pct": 9.02, "eta": "63:15:45", "max_grad_norm": 0.8, "loss": 0.7016772031784058, "grad_norm": 1.1879212856292725, "learning_rate": 9.997052044939226e-05} +{"ts": "2025-12-27T00:25:18", "event": "train_log", "step": 1284, "epoch": 0.5417721518987342, "progress_pct": 9.03, "epoch_pct": 9.03, "eta": "63:12:28", "max_grad_norm": 0.8, "loss": 0.7711107134819031, "grad_norm": 1.2692012786865234, "learning_rate": 9.996969016402935e-05} +{"ts": "2025-12-27T00:25:38", "event": "train_log", "step": 1286, "epoch": 0.5426160337552742, "progress_pct": 9.04, "epoch_pct": 9.04, "eta": "63:09:19", "max_grad_norm": 0.8, "loss": 0.7807164788246155, "grad_norm": 1.3318448066711426, "learning_rate": 9.996884835161863e-05} +{"ts": "2025-12-27T00:26:00", "event": "train_log", "step": 1288, "epoch": 0.5434599156118144, "progress_pct": 9.06, "epoch_pct": 9.06, "eta": "63:06:29", "max_grad_norm": 0.8, "loss": 0.7331319451332092, "grad_norm": 1.1786744594573975, "learning_rate": 9.996799501235425e-05} +{"ts": "2025-12-27T00:26:19", "event": "train_log", "step": 1290, "epoch": 0.5443037974683544, "progress_pct": 9.07, "epoch_pct": 9.07, "eta": "63:03:11", "max_grad_norm": 0.8, "loss": 0.7191547155380249, "grad_norm": 1.4092369079589844, "learning_rate": 9.996713014643309e-05} +{"ts": "2025-12-27T00:26:41", "event": "train_log", "step": 1292, "epoch": 0.5451476793248945, "progress_pct": 9.09, "epoch_pct": 9.09, "eta": "63:00:23", "max_grad_norm": 0.8, "loss": 0.7233871221542358, "grad_norm": 1.377099633216858, "learning_rate": 9.996625375405463e-05} +{"ts": "2025-12-27T00:27:00", "event": "train_log", "step": 1294, "epoch": 0.5459915611814345, "progress_pct": 9.1, "epoch_pct": 9.1, "eta": "62:57:05", "max_grad_norm": 0.8, "loss": 0.7925472855567932, "grad_norm": 1.404945969581604, "learning_rate": 9.996536583542105e-05} +{"ts": "2025-12-27T00:27:20", "event": "train_log", "step": 1296, "epoch": 0.5468354430379747, "progress_pct": 9.11, "epoch_pct": 9.11, "eta": "62:54:07", "max_grad_norm": 0.8, "loss": 0.7749786376953125, "grad_norm": 1.2555286884307861, "learning_rate": 9.996446639073718e-05} +{"ts": "2025-12-27T00:27:41", "event": "train_log", "step": 1298, "epoch": 0.5476793248945148, "progress_pct": 9.13, "epoch_pct": 9.13, "eta": "62:51:14", "max_grad_norm": 0.8, "loss": 0.7647517919540405, "grad_norm": 1.2577459812164307, "learning_rate": 9.996355542021048e-05} +{"ts": "2025-12-27T00:28:01", "event": "train_log", "step": 1300, "epoch": 0.5485232067510548, "progress_pct": 9.14, "epoch_pct": 9.14, "eta": "62:48:02", "max_grad_norm": 0.8, "loss": 0.8621891140937805, "grad_norm": 1.3587758541107178, "learning_rate": 9.996263292405113e-05} +{"ts": "2025-12-27T00:42:14", "event": "train_log", "step": 1300, "epoch": 0.5485232067510548, "progress_pct": 9.14, "epoch_pct": 9.14, "eta": "65:09:26", "max_grad_norm": 0.8, "eval_loss": 0.808323085308075, "eval_runtime": 853.577, "eval_samples_per_second": 2.468, "eval_steps_per_second": 2.468} +{"ts": "2025-12-27T00:42:33", "event": "train_log", "step": 1302, "epoch": 0.549367088607595, "progress_pct": 9.16, "epoch_pct": 9.16, "eta": "65:05:57", "max_grad_norm": 0.8, "loss": 0.749254584312439, "grad_norm": 1.327125906944275, "learning_rate": 9.996169890247191e-05} +{"ts": "2025-12-27T00:42:53", "event": "train_log", "step": 1304, "epoch": 0.550210970464135, "progress_pct": 9.17, "epoch_pct": 9.17, "eta": "65:02:35", "max_grad_norm": 0.8, "loss": 0.7362856268882751, "grad_norm": 1.4620670080184937, "learning_rate": 9.99607533556883e-05} +{"ts": "2025-12-27T00:43:12", "event": "train_log", "step": 1306, "epoch": 0.5510548523206751, "progress_pct": 9.18, "epoch_pct": 9.18, "eta": "64:59:13", "max_grad_norm": 0.8, "loss": 0.7918445467948914, "grad_norm": 1.4119454622268677, "learning_rate": 9.99597962839184e-05} +{"ts": "2025-12-27T00:43:32", "event": "train_log", "step": 1308, "epoch": 0.5518987341772152, "progress_pct": 9.2, "epoch_pct": 9.2, "eta": "64:55:50", "max_grad_norm": 0.8, "loss": 0.7348005175590515, "grad_norm": 1.497522234916687, "learning_rate": 9.995882768738298e-05} +{"ts": "2025-12-27T00:43:50", "event": "train_log", "step": 1310, "epoch": 0.5527426160337553, "progress_pct": 9.21, "epoch_pct": 9.21, "eta": "64:52:16", "max_grad_norm": 0.8, "loss": 0.8310725688934326, "grad_norm": 1.535741925239563, "learning_rate": 9.99578475663055e-05} +{"ts": "2025-12-27T00:44:10", "event": "train_log", "step": 1312, "epoch": 0.5535864978902953, "progress_pct": 9.23, "epoch_pct": 9.23, "eta": "64:49:02", "max_grad_norm": 0.8, "loss": 0.8232766389846802, "grad_norm": 1.4606215953826904, "learning_rate": 9.995685592091204e-05} +{"ts": "2025-12-27T00:44:30", "event": "train_log", "step": 1314, "epoch": 0.5544303797468354, "progress_pct": 9.24, "epoch_pct": 9.24, "eta": "64:45:52", "max_grad_norm": 0.8, "loss": 0.8273071050643921, "grad_norm": 1.2442357540130615, "learning_rate": 9.995585275143136e-05} +{"ts": "2025-12-27T00:44:50", "event": "train_log", "step": 1316, "epoch": 0.5552742616033756, "progress_pct": 9.25, "epoch_pct": 9.25, "eta": "64:42:29", "max_grad_norm": 0.8, "loss": 0.7518656253814697, "grad_norm": 1.5128520727157593, "learning_rate": 9.995483805809487e-05} +{"ts": "2025-12-27T00:45:10", "event": "train_log", "step": 1318, "epoch": 0.5561181434599156, "progress_pct": 9.27, "epoch_pct": 9.27, "eta": "64:39:15", "max_grad_norm": 0.8, "loss": 0.8261662721633911, "grad_norm": 1.340149998664856, "learning_rate": 9.995381184113664e-05} +{"ts": "2025-12-27T00:45:31", "event": "train_log", "step": 1320, "epoch": 0.5569620253164557, "progress_pct": 9.28, "epoch_pct": 9.28, "eta": "64:36:21", "max_grad_norm": 0.8, "loss": 0.5775256156921387, "grad_norm": 1.1409451961517334, "learning_rate": 9.99527741007934e-05} +{"ts": "2025-12-27T00:45:50", "event": "train_log", "step": 1322, "epoch": 0.5578059071729958, "progress_pct": 9.3, "epoch_pct": 9.3, "eta": "64:32:56", "max_grad_norm": 0.8, "loss": 0.7698423862457275, "grad_norm": 1.3489247560501099, "learning_rate": 9.995172483730455e-05} +{"ts": "2025-12-27T00:46:07", "event": "train_log", "step": 1324, "epoch": 0.5586497890295359, "progress_pct": 9.31, "epoch_pct": 9.31, "eta": "64:29:13", "max_grad_norm": 0.8, "loss": 0.8053334355354309, "grad_norm": 1.4950530529022217, "learning_rate": 9.995066405091211e-05} +{"ts": "2025-12-27T00:46:26", "event": "train_log", "step": 1326, "epoch": 0.5594936708860759, "progress_pct": 9.32, "epoch_pct": 9.32, "eta": "64:25:48", "max_grad_norm": 0.8, "loss": 0.7826266288757324, "grad_norm": 1.3814653158187866, "learning_rate": 9.994959174186078e-05} +{"ts": "2025-12-27T00:46:45", "event": "train_log", "step": 1328, "epoch": 0.560337552742616, "progress_pct": 9.34, "epoch_pct": 9.34, "eta": "64:22:34", "max_grad_norm": 0.8, "loss": 0.7862131595611572, "grad_norm": 1.3383625745773315, "learning_rate": 9.994850791039796e-05} +{"ts": "2025-12-27T00:47:05", "event": "train_log", "step": 1330, "epoch": 0.5611814345991561, "progress_pct": 9.35, "epoch_pct": 9.35, "eta": "64:19:15", "max_grad_norm": 0.8, "loss": 0.8428501486778259, "grad_norm": 1.3529670238494873, "learning_rate": 9.994741255677363e-05} +{"ts": "2025-12-27T00:47:25", "event": "train_log", "step": 1332, "epoch": 0.5620253164556962, "progress_pct": 9.37, "epoch_pct": 9.37, "eta": "64:16:06", "max_grad_norm": 0.8, "loss": 0.7340869307518005, "grad_norm": 1.254215121269226, "learning_rate": 9.994630568124049e-05} +{"ts": "2025-12-27T00:47:45", "event": "train_log", "step": 1334, "epoch": 0.5628691983122363, "progress_pct": 9.38, "epoch_pct": 9.38, "eta": "64:12:57", "max_grad_norm": 0.8, "loss": 0.7052226662635803, "grad_norm": 1.2869828939437866, "learning_rate": 9.994518728405386e-05} +{"ts": "2025-12-27T00:48:02", "event": "train_log", "step": 1336, "epoch": 0.5637130801687764, "progress_pct": 9.4, "epoch_pct": 9.4, "eta": "64:09:25", "max_grad_norm": 0.8, "loss": 0.8297074437141418, "grad_norm": 1.4321808815002441, "learning_rate": 9.994405736547174e-05} +{"ts": "2025-12-27T00:48:23", "event": "train_log", "step": 1338, "epoch": 0.5645569620253165, "progress_pct": 9.41, "epoch_pct": 9.41, "eta": "64:06:23", "max_grad_norm": 0.8, "loss": 0.7183220982551575, "grad_norm": 1.4638891220092773, "learning_rate": 9.994291592575478e-05} +{"ts": "2025-12-27T00:48:43", "event": "train_log", "step": 1340, "epoch": 0.5654008438818565, "progress_pct": 9.42, "epoch_pct": 9.42, "eta": "64:03:17", "max_grad_norm": 0.8, "loss": 0.8146093487739563, "grad_norm": 1.4947413206100464, "learning_rate": 9.994176296516628e-05} +{"ts": "2025-12-27T00:49:01", "event": "train_log", "step": 1342, "epoch": 0.5662447257383966, "progress_pct": 9.44, "epoch_pct": 9.44, "eta": "63:59:50", "max_grad_norm": 0.8, "loss": 0.7583593130111694, "grad_norm": 1.343862533569336, "learning_rate": 9.994059848397221e-05} +{"ts": "2025-12-27T00:49:21", "event": "train_log", "step": 1344, "epoch": 0.5670886075949367, "progress_pct": 9.45, "epoch_pct": 9.45, "eta": "63:56:41", "max_grad_norm": 0.8, "loss": 0.7682924270629883, "grad_norm": 1.203550100326538, "learning_rate": 9.993942248244121e-05} +{"ts": "2025-12-27T00:49:40", "event": "train_log", "step": 1346, "epoch": 0.5679324894514768, "progress_pct": 9.47, "epoch_pct": 9.47, "eta": "63:53:26", "max_grad_norm": 0.8, "loss": 0.8139828443527222, "grad_norm": 1.287660002708435, "learning_rate": 9.993823496084455e-05} +{"ts": "2025-12-27T00:50:01", "event": "train_log", "step": 1348, "epoch": 0.5687763713080168, "progress_pct": 9.48, "epoch_pct": 9.48, "eta": "63:50:25", "max_grad_norm": 0.8, "loss": 0.7529099583625793, "grad_norm": 1.3326014280319214, "learning_rate": 9.993703591945616e-05} +{"ts": "2025-12-27T00:50:20", "event": "train_log", "step": 1350, "epoch": 0.569620253164557, "progress_pct": 9.49, "epoch_pct": 9.49, "eta": "63:47:15", "max_grad_norm": 0.8, "loss": 0.6997471451759338, "grad_norm": 1.2441487312316895, "learning_rate": 9.993582535855263e-05} +{"ts": "2025-12-27T00:50:41", "event": "train_log", "step": 1352, "epoch": 0.570464135021097, "progress_pct": 9.51, "epoch_pct": 9.51, "eta": "63:44:15", "max_grad_norm": 0.8, "loss": 0.7421218752861023, "grad_norm": 1.2647649049758911, "learning_rate": 9.993460327841325e-05} +{"ts": "2025-12-27T00:51:03", "event": "train_log", "step": 1354, "epoch": 0.5713080168776371, "progress_pct": 9.52, "epoch_pct": 9.52, "eta": "63:41:30", "max_grad_norm": 0.8, "loss": 0.7342398166656494, "grad_norm": 1.146399974822998, "learning_rate": 9.99333696793199e-05} +{"ts": "2025-12-27T00:51:24", "event": "train_log", "step": 1356, "epoch": 0.5721518987341773, "progress_pct": 9.54, "epoch_pct": 9.54, "eta": "63:38:38", "max_grad_norm": 0.8, "loss": 0.7175891399383545, "grad_norm": 1.3346691131591797, "learning_rate": 9.993212456155715e-05} +{"ts": "2025-12-27T00:51:44", "event": "train_log", "step": 1358, "epoch": 0.5729957805907173, "progress_pct": 9.55, "epoch_pct": 9.55, "eta": "63:35:31", "max_grad_norm": 0.8, "loss": 0.8108891248703003, "grad_norm": 1.3950672149658203, "learning_rate": 9.993086792541222e-05} +{"ts": "2025-12-27T00:52:04", "event": "train_log", "step": 1360, "epoch": 0.5738396624472574, "progress_pct": 9.56, "epoch_pct": 9.56, "eta": "63:32:34", "max_grad_norm": 0.8, "loss": 0.6979889273643494, "grad_norm": 1.339931845664978, "learning_rate": 9.992959977117502e-05} +{"ts": "2025-12-27T00:52:26", "event": "train_log", "step": 1362, "epoch": 0.5746835443037974, "progress_pct": 9.58, "epoch_pct": 9.58, "eta": "63:29:45", "max_grad_norm": 0.8, "loss": 0.7635799050331116, "grad_norm": 1.3276840448379517, "learning_rate": 9.992832009913806e-05} +{"ts": "2025-12-27T00:52:46", "event": "train_log", "step": 1364, "epoch": 0.5755274261603376, "progress_pct": 9.59, "epoch_pct": 9.59, "eta": "63:26:42", "max_grad_norm": 0.8, "loss": 0.7575043439865112, "grad_norm": 1.5015610456466675, "learning_rate": 9.992702890959653e-05} +{"ts": "2025-12-27T00:53:05", "event": "train_log", "step": 1366, "epoch": 0.5763713080168776, "progress_pct": 9.61, "epoch_pct": 9.61, "eta": "63:23:39", "max_grad_norm": 0.8, "loss": 0.8134847283363342, "grad_norm": 1.4755414724349976, "learning_rate": 9.99257262028483e-05} +{"ts": "2025-12-27T00:53:25", "event": "train_log", "step": 1368, "epoch": 0.5772151898734177, "progress_pct": 9.62, "epoch_pct": 9.62, "eta": "63:20:32", "max_grad_norm": 0.8, "loss": 0.7663828134536743, "grad_norm": 1.3788783550262451, "learning_rate": 9.992441197919388e-05} +{"ts": "2025-12-27T00:53:47", "event": "train_log", "step": 1370, "epoch": 0.5780590717299579, "progress_pct": 9.63, "epoch_pct": 9.63, "eta": "63:17:50", "max_grad_norm": 0.8, "loss": 0.6711251735687256, "grad_norm": 1.2814711332321167, "learning_rate": 9.992308623893644e-05} +{"ts": "2025-12-27T00:54:05", "event": "train_log", "step": 1372, "epoch": 0.5789029535864979, "progress_pct": 9.65, "epoch_pct": 9.65, "eta": "63:14:34", "max_grad_norm": 0.8, "loss": 0.8097200393676758, "grad_norm": 1.5343635082244873, "learning_rate": 9.99217489823818e-05} +{"ts": "2025-12-27T00:54:23", "event": "train_log", "step": 1374, "epoch": 0.579746835443038, "progress_pct": 9.66, "epoch_pct": 9.66, "eta": "63:11:17", "max_grad_norm": 0.8, "loss": 0.8274240493774414, "grad_norm": 1.3029557466506958, "learning_rate": 9.992040020983843e-05} +{"ts": "2025-12-27T00:54:43", "event": "train_log", "step": 1376, "epoch": 0.580590717299578, "progress_pct": 9.68, "epoch_pct": 9.68, "eta": "63:08:15", "max_grad_norm": 0.8, "loss": 0.7758964896202087, "grad_norm": 1.4034144878387451, "learning_rate": 9.991903992161746e-05} +{"ts": "2025-12-27T00:55:03", "event": "train_log", "step": 1378, "epoch": 0.5814345991561182, "progress_pct": 9.69, "epoch_pct": 9.69, "eta": "63:05:14", "max_grad_norm": 0.8, "loss": 0.6571930050849915, "grad_norm": 1.2340021133422852, "learning_rate": 9.991766811803271e-05} +{"ts": "2025-12-27T00:55:22", "event": "train_log", "step": 1380, "epoch": 0.5822784810126582, "progress_pct": 9.7, "epoch_pct": 9.7, "eta": "63:02:05", "max_grad_norm": 0.8, "loss": 0.7381542921066284, "grad_norm": 1.3082842826843262, "learning_rate": 9.991628479940061e-05} +{"ts": "2025-12-27T00:55:42", "event": "train_log", "step": 1382, "epoch": 0.5831223628691983, "progress_pct": 9.72, "epoch_pct": 9.72, "eta": "62:59:14", "max_grad_norm": 0.8, "loss": 0.8081237077713013, "grad_norm": 1.8134801387786865, "learning_rate": 9.991488996604025e-05} +{"ts": "2025-12-27T00:56:03", "event": "train_log", "step": 1384, "epoch": 0.5839662447257384, "progress_pct": 9.73, "epoch_pct": 9.73, "eta": "62:56:20", "max_grad_norm": 0.8, "loss": 0.7761610746383667, "grad_norm": 1.4598309993743896, "learning_rate": 9.991348361827343e-05} +{"ts": "2025-12-27T00:56:25", "event": "train_log", "step": 1386, "epoch": 0.5848101265822785, "progress_pct": 9.75, "epoch_pct": 9.75, "eta": "62:53:47", "max_grad_norm": 0.8, "loss": 0.6872953176498413, "grad_norm": 1.2974225282669067, "learning_rate": 9.991206575642453e-05} +{"ts": "2025-12-27T00:56:45", "event": "train_log", "step": 1388, "epoch": 0.5856540084388185, "progress_pct": 9.76, "epoch_pct": 9.76, "eta": "62:50:46", "max_grad_norm": 0.8, "loss": 0.7601345777511597, "grad_norm": 1.24009370803833, "learning_rate": 9.991063638082065e-05} +{"ts": "2025-12-27T00:57:05", "event": "train_log", "step": 1390, "epoch": 0.5864978902953587, "progress_pct": 9.77, "epoch_pct": 9.77, "eta": "62:47:48", "max_grad_norm": 0.8, "loss": 0.7138593792915344, "grad_norm": 1.176713228225708, "learning_rate": 9.99091954917915e-05} +{"ts": "2025-12-27T00:57:25", "event": "train_log", "step": 1392, "epoch": 0.5873417721518988, "progress_pct": 9.79, "epoch_pct": 9.79, "eta": "62:44:55", "max_grad_norm": 0.8, "loss": 0.7730305194854736, "grad_norm": 1.1056525707244873, "learning_rate": 9.990774308966949e-05} +{"ts": "2025-12-27T00:57:46", "event": "train_log", "step": 1394, "epoch": 0.5881856540084388, "progress_pct": 9.8, "epoch_pct": 9.8, "eta": "62:42:06", "max_grad_norm": 0.8, "loss": 0.7076689600944519, "grad_norm": 1.382847547531128, "learning_rate": 9.990627917478962e-05} +{"ts": "2025-12-27T00:58:04", "event": "train_log", "step": 1396, "epoch": 0.5890295358649789, "progress_pct": 9.82, "epoch_pct": 9.82, "eta": "62:38:57", "max_grad_norm": 0.8, "loss": 0.7970513105392456, "grad_norm": 1.2507930994033813, "learning_rate": 9.990480374748964e-05} +{"ts": "2025-12-27T00:58:24", "event": "train_log", "step": 1398, "epoch": 0.589873417721519, "progress_pct": 9.83, "epoch_pct": 9.83, "eta": "62:35:57", "max_grad_norm": 0.8, "loss": 0.7906717658042908, "grad_norm": 1.2266724109649658, "learning_rate": 9.990331680810987e-05} +{"ts": "2025-12-27T00:58:42", "event": "train_log", "step": 1400, "epoch": 0.5907172995780591, "progress_pct": 9.85, "epoch_pct": 9.85, "eta": "62:32:53", "max_grad_norm": 0.8, "loss": 0.853204607963562, "grad_norm": 1.299920916557312, "learning_rate": 9.99018183569933e-05} +{"ts": "2025-12-27T01:12:54", "event": "train_log", "step": 1400, "epoch": 0.5907172995780591, "progress_pct": 9.85, "epoch_pct": 9.85, "eta": "64:42:54", "max_grad_norm": 0.8, "eval_loss": 0.8009664416313171, "eval_runtime": 851.9417, "eval_samples_per_second": 2.473, "eval_steps_per_second": 2.473} +{"ts": "2025-12-27T01:13:15", "event": "train_log", "step": 1402, "epoch": 0.5915611814345991, "progress_pct": 9.86, "epoch_pct": 9.86, "eta": "64:39:57", "max_grad_norm": 0.8, "loss": 0.8140703439712524, "grad_norm": 1.2114863395690918, "learning_rate": 9.990030839448564e-05} +{"ts": "2025-12-27T01:13:35", "event": "train_log", "step": 1404, "epoch": 0.5924050632911393, "progress_pct": 9.87, "epoch_pct": 9.87, "eta": "64:36:52", "max_grad_norm": 0.8, "loss": 0.7471320629119873, "grad_norm": 1.3301794528961182, "learning_rate": 9.989878692093518e-05} +{"ts": "2025-12-27T01:13:55", "event": "train_log", "step": 1406, "epoch": 0.5932489451476793, "progress_pct": 9.89, "epoch_pct": 9.89, "eta": "64:33:43", "max_grad_norm": 0.8, "loss": 0.7307024002075195, "grad_norm": 1.2611899375915527, "learning_rate": 9.98972539366929e-05} +{"ts": "2025-12-27T01:14:15", "event": "train_log", "step": 1408, "epoch": 0.5940928270042194, "progress_pct": 9.9, "epoch_pct": 9.9, "eta": "64:30:39", "max_grad_norm": 0.8, "loss": 0.6843112111091614, "grad_norm": 1.1717802286148071, "learning_rate": 9.989570944211244e-05} +{"ts": "2025-12-27T01:14:34", "event": "train_log", "step": 1410, "epoch": 0.5949367088607594, "progress_pct": 9.92, "epoch_pct": 9.92, "eta": "64:27:23", "max_grad_norm": 0.8, "loss": 0.7025372385978699, "grad_norm": 1.3323513269424438, "learning_rate": 9.989415343755006e-05} +{"ts": "2025-12-27T01:14:53", "event": "train_log", "step": 1412, "epoch": 0.5957805907172996, "progress_pct": 9.93, "epoch_pct": 9.93, "eta": "64:24:15", "max_grad_norm": 0.8, "loss": 0.7792683839797974, "grad_norm": 1.4225109815597534, "learning_rate": 9.989258592336473e-05} +{"ts": "2025-12-27T01:15:12", "event": "train_log", "step": 1414, "epoch": 0.5966244725738397, "progress_pct": 9.94, "epoch_pct": 9.94, "eta": "64:21:02", "max_grad_norm": 0.8, "loss": 0.8328315019607544, "grad_norm": 1.2878522872924805, "learning_rate": 9.989100689991804e-05} +{"ts": "2025-12-27T01:15:33", "event": "train_log", "step": 1416, "epoch": 0.5974683544303797, "progress_pct": 9.96, "epoch_pct": 9.96, "eta": "64:18:06", "max_grad_norm": 0.8, "loss": 0.7700617909431458, "grad_norm": 1.2067214250564575, "learning_rate": 9.988941636757421e-05} +{"ts": "2025-12-27T01:15:54", "event": "train_log", "step": 1418, "epoch": 0.5983122362869199, "progress_pct": 9.97, "epoch_pct": 9.97, "eta": "64:15:16", "max_grad_norm": 0.8, "loss": 0.6872363090515137, "grad_norm": 1.1213195323944092, "learning_rate": 9.988781432670019e-05} +{"ts": "2025-12-27T01:16:14", "event": "train_log", "step": 1420, "epoch": 0.5991561181434599, "progress_pct": 9.99, "epoch_pct": 9.99, "eta": "64:12:10", "max_grad_norm": 0.8, "loss": 0.7184111475944519, "grad_norm": 1.3211694955825806, "learning_rate": 9.98862007776655e-05} +{"ts": "2025-12-27T01:16:35", "event": "train_log", "step": 1422, "epoch": 0.6, "progress_pct": 10.0, "epoch_pct": 10.0, "eta": "64:09:19", "max_grad_norm": 0.8, "loss": 0.8120859265327454, "grad_norm": 1.1916998624801636, "learning_rate": 9.98845757208424e-05} +{"ts": "2025-12-27T01:16:55", "event": "train_log", "step": 1424, "epoch": 0.60084388185654, "progress_pct": 10.01, "epoch_pct": 10.01, "eta": "64:06:17", "max_grad_norm": 0.8, "loss": 0.7586462497711182, "grad_norm": 1.2772804498672485, "learning_rate": 9.988293915660572e-05} +{"ts": "2025-12-27T01:17:13", "event": "train_log", "step": 1426, "epoch": 0.6016877637130802, "progress_pct": 10.03, "epoch_pct": 10.03, "eta": "64:03:04", "max_grad_norm": 0.8, "loss": 0.8175994157791138, "grad_norm": 1.4139106273651123, "learning_rate": 9.988129108533299e-05} +{"ts": "2025-12-27T01:17:33", "event": "train_log", "step": 1428, "epoch": 0.6025316455696202, "progress_pct": 10.04, "epoch_pct": 10.04, "eta": "64:00:07", "max_grad_norm": 0.8, "loss": 0.7662636041641235, "grad_norm": 1.4481157064437866, "learning_rate": 9.987963150740439e-05} +{"ts": "2025-12-27T01:17:53", "event": "train_log", "step": 1430, "epoch": 0.6033755274261603, "progress_pct": 10.06, "epoch_pct": 10.06, "eta": "63:57:08", "max_grad_norm": 0.8, "loss": 0.7477837800979614, "grad_norm": 1.6000999212265015, "learning_rate": 9.987796042320277e-05} +{"ts": "2025-12-27T01:18:15", "event": "train_log", "step": 1432, "epoch": 0.6042194092827005, "progress_pct": 10.07, "epoch_pct": 10.07, "eta": "63:54:21", "max_grad_norm": 0.8, "loss": 0.7392798662185669, "grad_norm": 1.26194429397583, "learning_rate": 9.98762778331136e-05} +{"ts": "2025-12-27T01:18:34", "event": "train_log", "step": 1434, "epoch": 0.6050632911392405, "progress_pct": 10.08, "epoch_pct": 10.08, "eta": "63:51:15", "max_grad_norm": 0.8, "loss": 0.7795998454093933, "grad_norm": 1.2370645999908447, "learning_rate": 9.987458373752503e-05} +{"ts": "2025-12-27T01:18:53", "event": "train_log", "step": 1436, "epoch": 0.6059071729957806, "progress_pct": 10.1, "epoch_pct": 10.1, "eta": "63:48:11", "max_grad_norm": 0.8, "loss": 0.7833777070045471, "grad_norm": 1.4908311367034912, "learning_rate": 9.987287813682784e-05} +{"ts": "2025-12-27T01:19:13", "event": "train_log", "step": 1438, "epoch": 0.6067510548523207, "progress_pct": 10.11, "epoch_pct": 10.11, "eta": "63:45:12", "max_grad_norm": 0.8, "loss": 0.7269768118858337, "grad_norm": 1.2918652296066284, "learning_rate": 9.987116103141549e-05} +{"ts": "2025-12-27T01:19:33", "event": "train_log", "step": 1440, "epoch": 0.6075949367088608, "progress_pct": 10.13, "epoch_pct": 10.13, "eta": "63:42:18", "max_grad_norm": 0.8, "loss": 0.7599279284477234, "grad_norm": 1.2170461416244507, "learning_rate": 9.98694324216841e-05} +{"ts": "2025-12-27T01:19:52", "event": "train_log", "step": 1442, "epoch": 0.6084388185654008, "progress_pct": 10.14, "epoch_pct": 10.14, "eta": "63:39:08", "max_grad_norm": 0.8, "loss": 0.8256514668464661, "grad_norm": 1.4373505115509033, "learning_rate": 9.98676923080324e-05} +{"ts": "2025-12-27T01:20:12", "event": "train_log", "step": 1444, "epoch": 0.6092827004219409, "progress_pct": 10.15, "epoch_pct": 10.15, "eta": "63:36:17", "max_grad_norm": 0.8, "loss": 0.8462428450584412, "grad_norm": 1.3523614406585693, "learning_rate": 9.986594069086181e-05} +{"ts": "2025-12-27T01:20:32", "event": "train_log", "step": 1446, "epoch": 0.610126582278481, "progress_pct": 10.17, "epoch_pct": 10.17, "eta": "63:33:13", "max_grad_norm": 0.8, "loss": 0.8402239084243774, "grad_norm": 1.5131851434707642, "learning_rate": 9.98641775705764e-05} +{"ts": "2025-12-27T01:20:53", "event": "train_log", "step": 1448, "epoch": 0.6109704641350211, "progress_pct": 10.18, "epoch_pct": 10.18, "eta": "63:30:32", "max_grad_norm": 0.8, "loss": 0.7585759162902832, "grad_norm": 1.3518229722976685, "learning_rate": 9.98624029475829e-05} +{"ts": "2025-12-27T01:21:13", "event": "train_log", "step": 1450, "epoch": 0.6118143459915611, "progress_pct": 10.2, "epoch_pct": 10.2, "eta": "63:27:33", "max_grad_norm": 0.8, "loss": 0.773881733417511, "grad_norm": 1.3403998613357544, "learning_rate": 9.986061682229064e-05} +{"ts": "2025-12-27T01:21:33", "event": "train_log", "step": 1452, "epoch": 0.6126582278481013, "progress_pct": 10.21, "epoch_pct": 10.21, "eta": "63:24:40", "max_grad_norm": 0.8, "loss": 0.6770316958427429, "grad_norm": 1.1835366487503052, "learning_rate": 9.985881919511168e-05} +{"ts": "2025-12-27T01:21:53", "event": "train_log", "step": 1454, "epoch": 0.6135021097046414, "progress_pct": 10.23, "epoch_pct": 10.23, "eta": "63:21:44", "max_grad_norm": 0.8, "loss": 0.7081645727157593, "grad_norm": 1.1825730800628662, "learning_rate": 9.985701006646069e-05} +{"ts": "2025-12-27T01:22:12", "event": "train_log", "step": 1456, "epoch": 0.6143459915611814, "progress_pct": 10.24, "epoch_pct": 10.24, "eta": "63:18:46", "max_grad_norm": 0.8, "loss": 0.7750917673110962, "grad_norm": 1.378994345664978, "learning_rate": 9.9855189436755e-05} +{"ts": "2025-12-27T01:22:30", "event": "train_log", "step": 1458, "epoch": 0.6151898734177215, "progress_pct": 10.25, "epoch_pct": 10.25, "eta": "63:15:37", "max_grad_norm": 0.8, "loss": 0.7517801523208618, "grad_norm": 1.4208749532699585, "learning_rate": 9.985335730641458e-05} +{"ts": "2025-12-27T01:22:51", "event": "train_log", "step": 1460, "epoch": 0.6160337552742616, "progress_pct": 10.27, "epoch_pct": 10.27, "eta": "63:12:45", "max_grad_norm": 0.8, "loss": 0.712832510471344, "grad_norm": 1.1413639783859253, "learning_rate": 9.98515136758621e-05} +{"ts": "2025-12-27T01:23:10", "event": "train_log", "step": 1462, "epoch": 0.6168776371308017, "progress_pct": 10.28, "epoch_pct": 10.28, "eta": "63:09:49", "max_grad_norm": 0.8, "loss": 0.7884142994880676, "grad_norm": 1.3949562311172485, "learning_rate": 9.984965854552283e-05} +{"ts": "2025-12-27T01:23:28", "event": "train_log", "step": 1464, "epoch": 0.6177215189873417, "progress_pct": 10.3, "epoch_pct": 10.3, "eta": "63:06:36", "max_grad_norm": 0.8, "loss": 0.796623706817627, "grad_norm": 1.4057096242904663, "learning_rate": 9.984779191582471e-05} +{"ts": "2025-12-27T01:23:47", "event": "train_log", "step": 1466, "epoch": 0.6185654008438819, "progress_pct": 10.31, "epoch_pct": 10.31, "eta": "63:03:42", "max_grad_norm": 0.8, "loss": 0.7862933874130249, "grad_norm": 1.1681689023971558, "learning_rate": 9.984591378719834e-05} +{"ts": "2025-12-27T01:24:07", "event": "train_log", "step": 1468, "epoch": 0.619409282700422, "progress_pct": 10.32, "epoch_pct": 10.32, "eta": "63:00:48", "max_grad_norm": 0.8, "loss": 0.7889828681945801, "grad_norm": 1.2585291862487793, "learning_rate": 9.984402416007696e-05} +{"ts": "2025-12-27T01:24:26", "event": "train_log", "step": 1470, "epoch": 0.620253164556962, "progress_pct": 10.34, "epoch_pct": 10.34, "eta": "62:57:51", "max_grad_norm": 0.8, "loss": 0.7375997304916382, "grad_norm": 1.2598098516464233, "learning_rate": 9.984212303489649e-05} +{"ts": "2025-12-27T01:24:46", "event": "train_log", "step": 1472, "epoch": 0.6210970464135022, "progress_pct": 10.35, "epoch_pct": 10.35, "eta": "62:54:55", "max_grad_norm": 0.8, "loss": 0.7839564085006714, "grad_norm": 1.4628467559814453, "learning_rate": 9.984021041209547e-05} +{"ts": "2025-12-27T01:25:05", "event": "train_log", "step": 1474, "epoch": 0.6219409282700422, "progress_pct": 10.37, "epoch_pct": 10.37, "eta": "62:51:57", "max_grad_norm": 0.8, "loss": 0.7566051483154297, "grad_norm": 1.3606770038604736, "learning_rate": 9.983828629211511e-05} +{"ts": "2025-12-27T01:25:25", "event": "train_log", "step": 1476, "epoch": 0.6227848101265823, "progress_pct": 10.38, "epoch_pct": 10.38, "eta": "62:49:06", "max_grad_norm": 0.8, "loss": 0.6638457179069519, "grad_norm": 1.182644248008728, "learning_rate": 9.983635067539927e-05} +{"ts": "2025-12-27T01:25:44", "event": "train_log", "step": 1478, "epoch": 0.6236286919831223, "progress_pct": 10.39, "epoch_pct": 10.39, "eta": "62:46:14", "max_grad_norm": 0.8, "loss": 0.8227225542068481, "grad_norm": 1.5617793798446655, "learning_rate": 9.983440356239445e-05} +{"ts": "2025-12-27T01:26:04", "event": "train_log", "step": 1480, "epoch": 0.6244725738396625, "progress_pct": 10.41, "epoch_pct": 10.41, "eta": "62:43:24", "max_grad_norm": 0.8, "loss": 0.7086431980133057, "grad_norm": 1.2290058135986328, "learning_rate": 9.98324449535498e-05} +{"ts": "2025-12-27T01:26:22", "event": "train_log", "step": 1482, "epoch": 0.6253164556962025, "progress_pct": 10.42, "epoch_pct": 10.42, "eta": "62:40:21", "max_grad_norm": 0.8, "loss": 0.8076596856117249, "grad_norm": 1.3822678327560425, "learning_rate": 9.983047484931716e-05} +{"ts": "2025-12-27T01:26:42", "event": "train_log", "step": 1484, "epoch": 0.6261603375527426, "progress_pct": 10.44, "epoch_pct": 10.44, "eta": "62:37:30", "max_grad_norm": 0.8, "loss": 0.7514539361000061, "grad_norm": 1.163699746131897, "learning_rate": 9.982849325015098e-05} +{"ts": "2025-12-27T01:27:01", "event": "train_log", "step": 1486, "epoch": 0.6270042194092827, "progress_pct": 10.45, "epoch_pct": 10.45, "eta": "62:34:35", "max_grad_norm": 0.8, "loss": 0.7298142910003662, "grad_norm": 1.2635631561279297, "learning_rate": 9.982650015650839e-05} +{"ts": "2025-12-27T01:27:21", "event": "train_log", "step": 1488, "epoch": 0.6278481012658228, "progress_pct": 10.46, "epoch_pct": 10.46, "eta": "62:31:43", "max_grad_norm": 0.8, "loss": 0.8092831373214722, "grad_norm": 1.3135387897491455, "learning_rate": 9.982449556884914e-05} +{"ts": "2025-12-27T01:27:38", "event": "train_log", "step": 1490, "epoch": 0.6286919831223629, "progress_pct": 10.48, "epoch_pct": 10.48, "eta": "62:28:36", "max_grad_norm": 0.8, "loss": 0.7934147715568542, "grad_norm": 1.3577877283096313, "learning_rate": 9.982247948763567e-05} +{"ts": "2025-12-27T01:27:57", "event": "train_log", "step": 1492, "epoch": 0.6295358649789029, "progress_pct": 10.49, "epoch_pct": 10.49, "eta": "62:25:40", "max_grad_norm": 0.8, "loss": 0.789363443851471, "grad_norm": 1.1482092142105103, "learning_rate": 9.982045191333304e-05} +{"ts": "2025-12-27T01:28:16", "event": "train_log", "step": 1494, "epoch": 0.6303797468354431, "progress_pct": 10.51, "epoch_pct": 10.51, "eta": "62:22:49", "max_grad_norm": 0.8, "loss": 0.7458413243293762, "grad_norm": 1.189771056175232, "learning_rate": 9.981841284640895e-05} +{"ts": "2025-12-27T01:28:36", "event": "train_log", "step": 1496, "epoch": 0.6312236286919831, "progress_pct": 10.52, "epoch_pct": 10.52, "eta": "62:20:05", "max_grad_norm": 0.8, "loss": 0.7299918532371521, "grad_norm": 1.2815836668014526, "learning_rate": 9.981636228733383e-05} +{"ts": "2025-12-27T01:28:55", "event": "train_log", "step": 1498, "epoch": 0.6320675105485232, "progress_pct": 10.53, "epoch_pct": 10.53, "eta": "62:17:05", "max_grad_norm": 0.8, "loss": 0.7545169591903687, "grad_norm": 1.36761474609375, "learning_rate": 9.981430023658068e-05} +{"ts": "2025-12-27T01:29:14", "event": "train_log", "step": 1500, "epoch": 0.6329113924050633, "progress_pct": 10.55, "epoch_pct": 10.55, "eta": "62:14:11", "max_grad_norm": 0.8, "loss": 0.7358481884002686, "grad_norm": 1.2594345808029175, "learning_rate": 9.981222669462513e-05} +{"ts": "2025-12-27T01:43:40", "event": "train_log", "step": 1500, "epoch": 0.6329113924050633, "progress_pct": 10.55, "epoch_pct": 10.55, "eta": "64:16:34", "max_grad_norm": 0.8, "eval_loss": 0.7896141409873962, "eval_runtime": 865.9069, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433} +{"ts": "2025-12-27T01:44:00", "event": "train_log", "step": 1502, "epoch": 0.6337552742616034, "progress_pct": 10.56, "epoch_pct": 10.56, "eta": "64:13:43", "max_grad_norm": 0.8, "loss": 0.8253764510154724, "grad_norm": 3.6419246196746826, "learning_rate": 9.981014166194556e-05} +{"ts": "2025-12-27T01:44:19", "event": "train_log", "step": 1504, "epoch": 0.6345991561181434, "progress_pct": 10.58, "epoch_pct": 10.58, "eta": "64:10:43", "max_grad_norm": 0.8, "loss": 0.8254884481430054, "grad_norm": 1.7333487272262573, "learning_rate": 9.980804513902294e-05} +{"ts": "2025-12-27T01:44:39", "event": "train_log", "step": 1506, "epoch": 0.6354430379746835, "progress_pct": 10.59, "epoch_pct": 10.59, "eta": "64:07:46", "max_grad_norm": 0.8, "loss": 0.7833738327026367, "grad_norm": 1.1998231410980225, "learning_rate": 9.980593712634088e-05} +{"ts": "2025-12-27T01:44:56", "event": "train_log", "step": 1508, "epoch": 0.6362869198312237, "progress_pct": 10.6, "epoch_pct": 10.6, "eta": "64:04:28", "max_grad_norm": 0.8, "loss": 0.753408670425415, "grad_norm": 1.347011685371399, "learning_rate": 9.980381762438566e-05} +{"ts": "2025-12-27T01:45:16", "event": "train_log", "step": 1510, "epoch": 0.6371308016877637, "progress_pct": 10.62, "epoch_pct": 10.62, "eta": "64:01:34", "max_grad_norm": 0.8, "loss": 0.7867791652679443, "grad_norm": 1.1759053468704224, "learning_rate": 9.980168663364622e-05} +{"ts": "2025-12-27T01:45:36", "event": "train_log", "step": 1512, "epoch": 0.6379746835443038, "progress_pct": 10.63, "epoch_pct": 10.63, "eta": "63:58:37", "max_grad_norm": 0.8, "loss": 0.6753612160682678, "grad_norm": 1.3113552331924438, "learning_rate": 9.979954415461412e-05} +{"ts": "2025-12-27T01:45:55", "event": "train_log", "step": 1514, "epoch": 0.6388185654008439, "progress_pct": 10.65, "epoch_pct": 10.65, "eta": "63:55:38", "max_grad_norm": 0.8, "loss": 0.750367283821106, "grad_norm": 1.3258320093154907, "learning_rate": 9.979739018778362e-05} +{"ts": "2025-12-27T01:46:15", "event": "train_log", "step": 1516, "epoch": 0.639662447257384, "progress_pct": 10.66, "epoch_pct": 10.66, "eta": "63:52:45", "max_grad_norm": 0.8, "loss": 0.7505861520767212, "grad_norm": 1.175145149230957, "learning_rate": 9.979522473365157e-05} +{"ts": "2025-12-27T01:46:34", "event": "train_log", "step": 1518, "epoch": 0.640506329113924, "progress_pct": 10.68, "epoch_pct": 10.68, "eta": "63:49:44", "max_grad_norm": 0.8, "loss": 0.7429317831993103, "grad_norm": 1.2276148796081543, "learning_rate": 9.979304779271752e-05} +{"ts": "2025-12-27T01:46:53", "event": "train_log", "step": 1520, "epoch": 0.6413502109704642, "progress_pct": 10.69, "epoch_pct": 10.69, "eta": "63:46:50", "max_grad_norm": 0.8, "loss": 0.786217212677002, "grad_norm": 1.3262875080108643, "learning_rate": 9.979085936548362e-05} +{"ts": "2025-12-27T01:47:12", "event": "train_log", "step": 1522, "epoch": 0.6421940928270042, "progress_pct": 10.7, "epoch_pct": 10.7, "eta": "63:43:51", "max_grad_norm": 0.8, "loss": 0.6942036151885986, "grad_norm": 1.3067121505737305, "learning_rate": 9.978865945245473e-05} +{"ts": "2025-12-27T01:47:30", "event": "train_log", "step": 1524, "epoch": 0.6430379746835443, "progress_pct": 10.72, "epoch_pct": 10.72, "eta": "63:40:42", "max_grad_norm": 0.8, "loss": 0.8281817436218262, "grad_norm": 1.5352400541305542, "learning_rate": 9.978644805413832e-05} +{"ts": "2025-12-27T01:47:48", "event": "train_log", "step": 1526, "epoch": 0.6438818565400843, "progress_pct": 10.73, "epoch_pct": 10.73, "eta": "63:37:36", "max_grad_norm": 0.8, "loss": 0.8110972046852112, "grad_norm": 1.2848507165908813, "learning_rate": 9.97842251710445e-05} +{"ts": "2025-12-27T01:48:07", "event": "train_log", "step": 1528, "epoch": 0.6447257383966245, "progress_pct": 10.75, "epoch_pct": 10.75, "eta": "63:34:37", "max_grad_norm": 0.8, "loss": 0.7354730367660522, "grad_norm": 1.352196216583252, "learning_rate": 9.978199080368607e-05} +{"ts": "2025-12-27T01:48:27", "event": "train_log", "step": 1530, "epoch": 0.6455696202531646, "progress_pct": 10.76, "epoch_pct": 10.76, "eta": "63:31:43", "max_grad_norm": 0.8, "loss": 0.7915583848953247, "grad_norm": 1.2427687644958496, "learning_rate": 9.977974495257842e-05} +{"ts": "2025-12-27T01:48:46", "event": "train_log", "step": 1532, "epoch": 0.6464135021097046, "progress_pct": 10.77, "epoch_pct": 10.77, "eta": "63:28:46", "max_grad_norm": 0.8, "loss": 0.7400109171867371, "grad_norm": 1.3163504600524902, "learning_rate": 9.977748761823967e-05} +{"ts": "2025-12-27T01:49:05", "event": "train_log", "step": 1534, "epoch": 0.6472573839662448, "progress_pct": 10.79, "epoch_pct": 10.79, "eta": "63:25:52", "max_grad_norm": 0.8, "loss": 0.7104899287223816, "grad_norm": 1.2496893405914307, "learning_rate": 9.977521880119049e-05} +{"ts": "2025-12-27T01:49:25", "event": "train_log", "step": 1536, "epoch": 0.6481012658227848, "progress_pct": 10.8, "epoch_pct": 10.8, "eta": "63:23:00", "max_grad_norm": 0.8, "loss": 0.8074463605880737, "grad_norm": 1.0907179117202759, "learning_rate": 9.97729385019543e-05} +{"ts": "2025-12-27T01:49:44", "event": "train_log", "step": 1538, "epoch": 0.6489451476793249, "progress_pct": 10.82, "epoch_pct": 10.82, "eta": "63:20:04", "max_grad_norm": 0.8, "loss": 0.7770540714263916, "grad_norm": 1.2323429584503174, "learning_rate": 9.977064672105712e-05} +{"ts": "2025-12-27T01:50:03", "event": "train_log", "step": 1540, "epoch": 0.6497890295358649, "progress_pct": 10.83, "epoch_pct": 10.83, "eta": "63:17:13", "max_grad_norm": 0.8, "loss": 0.806465208530426, "grad_norm": 1.224428415298462, "learning_rate": 9.976834345902759e-05} +{"ts": "2025-12-27T01:50:22", "event": "train_log", "step": 1542, "epoch": 0.6506329113924051, "progress_pct": 10.84, "epoch_pct": 10.84, "eta": "63:14:19", "max_grad_norm": 0.8, "loss": 0.7306749224662781, "grad_norm": 1.3529564142227173, "learning_rate": 9.976602871639705e-05} +{"ts": "2025-12-27T01:50:41", "event": "train_log", "step": 1544, "epoch": 0.6514767932489451, "progress_pct": 10.86, "epoch_pct": 10.86, "eta": "63:11:22", "max_grad_norm": 0.8, "loss": 0.783933699131012, "grad_norm": 1.1770031452178955, "learning_rate": 9.976370249369946e-05} +{"ts": "2025-12-27T01:51:01", "event": "train_log", "step": 1546, "epoch": 0.6523206751054852, "progress_pct": 10.87, "epoch_pct": 10.87, "eta": "63:08:38", "max_grad_norm": 0.8, "loss": 0.6937689185142517, "grad_norm": 1.205283522605896, "learning_rate": 9.976136479147144e-05} +{"ts": "2025-12-27T01:51:20", "event": "train_log", "step": 1548, "epoch": 0.6531645569620254, "progress_pct": 10.89, "epoch_pct": 10.89, "eta": "63:05:40", "max_grad_norm": 0.8, "loss": 0.8041763305664062, "grad_norm": 1.2329360246658325, "learning_rate": 9.975901561025223e-05} +{"ts": "2025-12-27T01:51:39", "event": "train_log", "step": 1550, "epoch": 0.6540084388185654, "progress_pct": 10.9, "epoch_pct": 10.9, "eta": "63:02:48", "max_grad_norm": 0.8, "loss": 0.750390887260437, "grad_norm": 1.499973177909851, "learning_rate": 9.975665495058377e-05} +{"ts": "2025-12-27T01:51:59", "event": "train_log", "step": 1552, "epoch": 0.6548523206751055, "progress_pct": 10.91, "epoch_pct": 10.91, "eta": "63:00:04", "max_grad_norm": 0.8, "loss": 0.7658298015594482, "grad_norm": 1.31832754611969, "learning_rate": 9.975428281301061e-05} +{"ts": "2025-12-27T01:52:18", "event": "train_log", "step": 1554, "epoch": 0.6556962025316456, "progress_pct": 10.93, "epoch_pct": 10.93, "eta": "62:57:09", "max_grad_norm": 0.8, "loss": 0.8651264905929565, "grad_norm": 1.3998414278030396, "learning_rate": 9.975189919807994e-05} +{"ts": "2025-12-27T01:52:38", "event": "train_log", "step": 1556, "epoch": 0.6565400843881857, "progress_pct": 10.94, "epoch_pct": 10.94, "eta": "62:54:28", "max_grad_norm": 0.8, "loss": 0.6776561141014099, "grad_norm": 1.2002551555633545, "learning_rate": 9.974950410634164e-05} +{"ts": "2025-12-27T01:52:58", "event": "train_log", "step": 1558, "epoch": 0.6573839662447257, "progress_pct": 10.96, "epoch_pct": 10.96, "eta": "62:51:41", "max_grad_norm": 0.8, "loss": 0.8159130811691284, "grad_norm": 1.1986602544784546, "learning_rate": 9.97470975383482e-05} +{"ts": "2025-12-27T01:53:16", "event": "train_log", "step": 1560, "epoch": 0.6582278481012658, "progress_pct": 10.97, "epoch_pct": 10.97, "eta": "62:48:45", "max_grad_norm": 0.8, "loss": 0.7528039216995239, "grad_norm": 1.3583602905273438, "learning_rate": 9.974467949465477e-05} +{"ts": "2025-12-27T01:53:35", "event": "train_log", "step": 1562, "epoch": 0.6590717299578059, "progress_pct": 10.98, "epoch_pct": 10.98, "eta": "62:45:51", "max_grad_norm": 0.8, "loss": 0.6970920562744141, "grad_norm": 1.4176239967346191, "learning_rate": 9.974224997581913e-05} +{"ts": "2025-12-27T01:53:54", "event": "train_log", "step": 1564, "epoch": 0.659915611814346, "progress_pct": 11.0, "epoch_pct": 11.0, "eta": "62:42:58", "max_grad_norm": 0.8, "loss": 0.7718377113342285, "grad_norm": 1.3899401426315308, "learning_rate": 9.973980898240177e-05} +{"ts": "2025-12-27T01:54:13", "event": "train_log", "step": 1566, "epoch": 0.660759493670886, "progress_pct": 11.01, "epoch_pct": 11.01, "eta": "62:40:14", "max_grad_norm": 0.8, "loss": 0.7346280217170715, "grad_norm": 1.222413182258606, "learning_rate": 9.973735651496571e-05} +{"ts": "2025-12-27T01:54:32", "event": "train_log", "step": 1568, "epoch": 0.6616033755274262, "progress_pct": 11.03, "epoch_pct": 11.03, "eta": "62:37:16", "max_grad_norm": 0.8, "loss": 0.7923588156700134, "grad_norm": 1.3750087022781372, "learning_rate": 9.973489257407676e-05} +{"ts": "2025-12-27T01:54:50", "event": "train_log", "step": 1570, "epoch": 0.6624472573839663, "progress_pct": 11.04, "epoch_pct": 11.04, "eta": "62:34:19", "max_grad_norm": 0.8, "loss": 0.8258910179138184, "grad_norm": 1.24547278881073, "learning_rate": 9.973241716030325e-05} +{"ts": "2025-12-27T01:55:09", "event": "train_log", "step": 1572, "epoch": 0.6632911392405063, "progress_pct": 11.05, "epoch_pct": 11.05, "eta": "62:31:37", "max_grad_norm": 0.8, "loss": 0.7869232296943665, "grad_norm": 1.2464141845703125, "learning_rate": 9.972993027421624e-05} +{"ts": "2025-12-27T01:55:28", "event": "train_log", "step": 1574, "epoch": 0.6641350210970464, "progress_pct": 11.07, "epoch_pct": 11.07, "eta": "62:28:46", "max_grad_norm": 0.8, "loss": 0.8144775629043579, "grad_norm": 1.3088903427124023, "learning_rate": 9.972743191638939e-05} +{"ts": "2025-12-27T01:55:47", "event": "train_log", "step": 1576, "epoch": 0.6649789029535865, "progress_pct": 11.08, "epoch_pct": 11.08, "eta": "62:25:59", "max_grad_norm": 0.8, "loss": 0.7432073950767517, "grad_norm": 1.2252418994903564, "learning_rate": 9.972492208739903e-05} +{"ts": "2025-12-27T01:56:07", "event": "train_log", "step": 1578, "epoch": 0.6658227848101266, "progress_pct": 11.1, "epoch_pct": 11.1, "eta": "62:23:17", "max_grad_norm": 0.8, "loss": 0.7386854887008667, "grad_norm": 1.2303717136383057, "learning_rate": 9.972240078782413e-05} +{"ts": "2025-12-27T01:56:26", "event": "train_log", "step": 1580, "epoch": 0.6666666666666666, "progress_pct": 11.11, "epoch_pct": 11.11, "eta": "62:20:28", "max_grad_norm": 0.8, "loss": 0.7127882838249207, "grad_norm": 1.0226294994354248, "learning_rate": 9.971986801824631e-05} +{"ts": "2025-12-27T01:56:45", "event": "train_log", "step": 1582, "epoch": 0.6675105485232068, "progress_pct": 11.13, "epoch_pct": 11.13, "eta": "62:17:42", "max_grad_norm": 0.8, "loss": 0.7557716369628906, "grad_norm": 1.362332820892334, "learning_rate": 9.971732377924982e-05} +{"ts": "2025-12-27T01:57:05", "event": "train_log", "step": 1584, "epoch": 0.6683544303797468, "progress_pct": 11.14, "epoch_pct": 11.14, "eta": "62:15:02", "max_grad_norm": 0.8, "loss": 0.7832611203193665, "grad_norm": 1.4436695575714111, "learning_rate": 9.971476807142158e-05} +{"ts": "2025-12-27T01:57:25", "event": "train_log", "step": 1586, "epoch": 0.6691983122362869, "progress_pct": 11.15, "epoch_pct": 11.15, "eta": "62:12:23", "max_grad_norm": 0.8, "loss": 0.8190197944641113, "grad_norm": 1.276695966720581, "learning_rate": 9.971220089535113e-05} +{"ts": "2025-12-27T01:57:45", "event": "train_log", "step": 1588, "epoch": 0.6700421940928271, "progress_pct": 11.17, "epoch_pct": 11.17, "eta": "62:09:46", "max_grad_norm": 0.8, "loss": 0.747222363948822, "grad_norm": 1.2413527965545654, "learning_rate": 9.970962225163069e-05} +{"ts": "2025-12-27T01:58:06", "event": "train_log", "step": 1590, "epoch": 0.6708860759493671, "progress_pct": 11.18, "epoch_pct": 11.18, "eta": "62:07:18", "max_grad_norm": 0.8, "loss": 0.7846449017524719, "grad_norm": 1.3395767211914062, "learning_rate": 9.970703214085507e-05} +{"ts": "2025-12-27T01:58:25", "event": "train_log", "step": 1592, "epoch": 0.6717299578059072, "progress_pct": 11.2, "epoch_pct": 11.2, "eta": "62:04:30", "max_grad_norm": 0.8, "loss": 0.8160232901573181, "grad_norm": 1.291327953338623, "learning_rate": 9.970443056362178e-05} +{"ts": "2025-12-27T01:58:45", "event": "train_log", "step": 1594, "epoch": 0.6725738396624472, "progress_pct": 11.21, "epoch_pct": 11.21, "eta": "62:01:53", "max_grad_norm": 0.8, "loss": 0.7413806915283203, "grad_norm": 1.3139684200286865, "learning_rate": 9.970181752053097e-05} +{"ts": "2025-12-27T01:59:05", "event": "train_log", "step": 1596, "epoch": 0.6734177215189874, "progress_pct": 11.22, "epoch_pct": 11.22, "eta": "61:59:13", "max_grad_norm": 0.8, "loss": 0.7637304067611694, "grad_norm": 1.3170921802520752, "learning_rate": 9.969919301218537e-05} +{"ts": "2025-12-27T01:59:25", "event": "train_log", "step": 1598, "epoch": 0.6742616033755274, "progress_pct": 11.24, "epoch_pct": 11.24, "eta": "61:56:39", "max_grad_norm": 0.8, "loss": 0.7823366522789001, "grad_norm": 1.3349758386611938, "learning_rate": 9.969655703919044e-05} +{"ts": "2025-12-27T01:59:46", "event": "train_log", "step": 1600, "epoch": 0.6751054852320675, "progress_pct": 11.25, "epoch_pct": 11.25, "eta": "61:54:06", "max_grad_norm": 0.8, "loss": 0.6587790846824646, "grad_norm": 1.2151578664779663, "learning_rate": 9.969390960215425e-05} +{"ts": "2025-12-27T02:14:07", "event": "train_log", "step": 1600, "epoch": 0.6751054852320675, "progress_pct": 11.25, "epoch_pct": 11.25, "eta": "63:47:21", "max_grad_norm": 0.8, "eval_loss": 0.7836604714393616, "eval_runtime": 861.5352, "eval_samples_per_second": 2.446, "eval_steps_per_second": 2.446} +{"ts": "2025-12-27T02:14:26", "event": "train_log", "step": 1602, "epoch": 0.6759493670886076, "progress_pct": 11.27, "epoch_pct": 11.27, "eta": "63:44:30", "max_grad_norm": 0.8, "loss": 0.7314544320106506, "grad_norm": 1.2541478872299194, "learning_rate": 9.96912507016875e-05} +{"ts": "2025-12-27T02:14:46", "event": "train_log", "step": 1604, "epoch": 0.6767932489451477, "progress_pct": 11.28, "epoch_pct": 11.28, "eta": "63:41:43", "max_grad_norm": 0.8, "loss": 0.702468752861023, "grad_norm": 1.091790795326233, "learning_rate": 9.968858033840357e-05} +{"ts": "2025-12-27T02:15:06", "event": "train_log", "step": 1606, "epoch": 0.6776371308016877, "progress_pct": 11.29, "epoch_pct": 11.29, "eta": "63:38:54", "max_grad_norm": 0.8, "loss": 0.7691897749900818, "grad_norm": 1.36745285987854, "learning_rate": 9.968589851291841e-05} +{"ts": "2025-12-27T02:15:25", "event": "train_log", "step": 1608, "epoch": 0.6784810126582278, "progress_pct": 11.31, "epoch_pct": 11.31, "eta": "63:36:09", "max_grad_norm": 0.8, "loss": 0.7422228455543518, "grad_norm": 1.1325993537902832, "learning_rate": 9.968320522585072e-05} +{"ts": "2025-12-27T02:15:46", "event": "train_log", "step": 1610, "epoch": 0.679324894514768, "progress_pct": 11.32, "epoch_pct": 11.32, "eta": "63:33:27", "max_grad_norm": 0.8, "loss": 0.677532434463501, "grad_norm": 1.1015450954437256, "learning_rate": 9.968050047782176e-05} +{"ts": "2025-12-27T02:16:05", "event": "train_log", "step": 1612, "epoch": 0.680168776371308, "progress_pct": 11.34, "epoch_pct": 11.34, "eta": "63:30:36", "max_grad_norm": 0.8, "loss": 0.7973438501358032, "grad_norm": 1.2216695547103882, "learning_rate": 9.967778426945548e-05} +{"ts": "2025-12-27T02:16:26", "event": "train_log", "step": 1614, "epoch": 0.6810126582278481, "progress_pct": 11.35, "epoch_pct": 11.35, "eta": "63:27:58", "max_grad_norm": 0.8, "loss": 0.6742876172065735, "grad_norm": 1.159395456314087, "learning_rate": 9.967505660137843e-05} +{"ts": "2025-12-27T02:16:45", "event": "train_log", "step": 1616, "epoch": 0.6818565400843882, "progress_pct": 11.36, "epoch_pct": 11.36, "eta": "63:25:10", "max_grad_norm": 0.8, "loss": 0.7592008709907532, "grad_norm": 1.404433250427246, "learning_rate": 9.967231747421988e-05} +{"ts": "2025-12-27T02:17:04", "event": "train_log", "step": 1618, "epoch": 0.6827004219409283, "progress_pct": 11.38, "epoch_pct": 11.38, "eta": "63:22:21", "max_grad_norm": 0.8, "loss": 0.7565826177597046, "grad_norm": 1.2489168643951416, "learning_rate": 9.966956688861164e-05} +{"ts": "2025-12-27T02:17:23", "event": "train_log", "step": 1620, "epoch": 0.6835443037974683, "progress_pct": 11.39, "epoch_pct": 11.39, "eta": "63:19:31", "max_grad_norm": 0.8, "loss": 0.7694597840309143, "grad_norm": 1.2960615158081055, "learning_rate": 9.966680484518825e-05} +{"ts": "2025-12-27T02:17:42", "event": "train_log", "step": 1622, "epoch": 0.6843881856540084, "progress_pct": 11.41, "epoch_pct": 11.41, "eta": "63:16:42", "max_grad_norm": 0.8, "loss": 0.8392959833145142, "grad_norm": 1.3598436117172241, "learning_rate": 9.966403134458685e-05} +{"ts": "2025-12-27T02:18:01", "event": "train_log", "step": 1624, "epoch": 0.6852320675105485, "progress_pct": 11.42, "epoch_pct": 11.42, "eta": "63:13:48", "max_grad_norm": 0.8, "loss": 0.8014217019081116, "grad_norm": 1.258065938949585, "learning_rate": 9.966124638744722e-05} +{"ts": "2025-12-27T02:18:19", "event": "train_log", "step": 1626, "epoch": 0.6860759493670886, "progress_pct": 11.43, "epoch_pct": 11.43, "eta": "63:10:56", "max_grad_norm": 0.8, "loss": 0.7029755711555481, "grad_norm": 1.3132309913635254, "learning_rate": 9.965844997441184e-05} +{"ts": "2025-12-27T02:18:38", "event": "train_log", "step": 1628, "epoch": 0.6869198312236287, "progress_pct": 11.45, "epoch_pct": 11.45, "eta": "63:08:04", "max_grad_norm": 0.8, "loss": 0.7213528752326965, "grad_norm": 1.1204946041107178, "learning_rate": 9.965564210612575e-05} +{"ts": "2025-12-27T02:18:59", "event": "train_log", "step": 1630, "epoch": 0.6877637130801688, "progress_pct": 11.46, "epoch_pct": 11.46, "eta": "63:05:31", "max_grad_norm": 0.8, "loss": 0.6895437240600586, "grad_norm": 1.037251591682434, "learning_rate": 9.965282278323667e-05} +{"ts": "2025-12-27T02:19:19", "event": "train_log", "step": 1632, "epoch": 0.6886075949367089, "progress_pct": 11.48, "epoch_pct": 11.48, "eta": "63:02:52", "max_grad_norm": 0.8, "loss": 0.8035063743591309, "grad_norm": 1.093807578086853, "learning_rate": 9.964999200639498e-05} +{"ts": "2025-12-27T02:19:38", "event": "train_log", "step": 1634, "epoch": 0.6894514767932489, "progress_pct": 11.49, "epoch_pct": 11.49, "eta": "63:00:03", "max_grad_norm": 0.8, "loss": 0.6191847920417786, "grad_norm": 1.367386817932129, "learning_rate": 9.964714977625367e-05} +{"ts": "2025-12-27T02:19:57", "event": "train_log", "step": 1636, "epoch": 0.6902953586497891, "progress_pct": 11.5, "epoch_pct": 11.5, "eta": "62:57:16", "max_grad_norm": 0.8, "loss": 0.7469727993011475, "grad_norm": 1.3160961866378784, "learning_rate": 9.964429609346841e-05} +{"ts": "2025-12-27T02:20:15", "event": "train_log", "step": 1638, "epoch": 0.6911392405063291, "progress_pct": 11.52, "epoch_pct": 11.52, "eta": "62:54:25", "max_grad_norm": 0.8, "loss": 0.7987836599349976, "grad_norm": 1.3736863136291504, "learning_rate": 9.964143095869748e-05} +{"ts": "2025-12-27T02:20:34", "event": "train_log", "step": 1640, "epoch": 0.6919831223628692, "progress_pct": 11.53, "epoch_pct": 11.53, "eta": "62:51:39", "max_grad_norm": 0.8, "loss": 0.7901709675788879, "grad_norm": 1.323209524154663, "learning_rate": 9.963855437260182e-05} +{"ts": "2025-12-27T02:20:53", "event": "train_log", "step": 1642, "epoch": 0.6928270042194092, "progress_pct": 11.55, "epoch_pct": 11.55, "eta": "62:48:49", "max_grad_norm": 0.8, "loss": 0.7889530658721924, "grad_norm": 1.3943440914154053, "learning_rate": 9.963566633584496e-05} +{"ts": "2025-12-27T02:21:12", "event": "train_log", "step": 1644, "epoch": 0.6936708860759494, "progress_pct": 11.56, "epoch_pct": 11.56, "eta": "62:46:05", "max_grad_norm": 0.8, "loss": 0.756829559803009, "grad_norm": 1.3699116706848145, "learning_rate": 9.963276684909317e-05} +{"ts": "2025-12-27T02:21:31", "event": "train_log", "step": 1646, "epoch": 0.6945147679324895, "progress_pct": 11.58, "epoch_pct": 11.58, "eta": "62:43:23", "max_grad_norm": 0.8, "loss": 0.7840303182601929, "grad_norm": 1.4216378927230835, "learning_rate": 9.962985591301529e-05} +{"ts": "2025-12-27T02:21:52", "event": "train_log", "step": 1648, "epoch": 0.6953586497890295, "progress_pct": 11.59, "epoch_pct": 11.59, "eta": "62:40:51", "max_grad_norm": 0.8, "loss": 0.700393557548523, "grad_norm": 1.2231985330581665, "learning_rate": 9.962693352828279e-05} +{"ts": "2025-12-27T02:22:13", "event": "train_log", "step": 1650, "epoch": 0.6962025316455697, "progress_pct": 11.6, "epoch_pct": 11.6, "eta": "62:38:21", "max_grad_norm": 0.8, "loss": 0.7010306715965271, "grad_norm": 1.3568313121795654, "learning_rate": 9.962399969556983e-05} +{"ts": "2025-12-27T02:22:33", "event": "train_log", "step": 1652, "epoch": 0.6970464135021097, "progress_pct": 11.62, "epoch_pct": 11.62, "eta": "62:35:43", "max_grad_norm": 0.8, "loss": 0.6935506463050842, "grad_norm": 1.1662907600402832, "learning_rate": 9.96210544155532e-05} +{"ts": "2025-12-27T02:22:53", "event": "train_log", "step": 1654, "epoch": 0.6978902953586498, "progress_pct": 11.63, "epoch_pct": 11.63, "eta": "62:33:06", "max_grad_norm": 0.8, "loss": 0.7913851141929626, "grad_norm": 1.3066680431365967, "learning_rate": 9.96180976889123e-05} +{"ts": "2025-12-27T02:23:14", "event": "train_log", "step": 1656, "epoch": 0.6987341772151898, "progress_pct": 11.65, "epoch_pct": 11.65, "eta": "62:30:39", "max_grad_norm": 0.8, "loss": 0.764849066734314, "grad_norm": 1.2268375158309937, "learning_rate": 9.961512951632918e-05} +{"ts": "2025-12-27T02:23:34", "event": "train_log", "step": 1658, "epoch": 0.69957805907173, "progress_pct": 11.66, "epoch_pct": 11.66, "eta": "62:28:01", "max_grad_norm": 0.8, "loss": 0.7544103860855103, "grad_norm": 1.4509469270706177, "learning_rate": 9.96121498984886e-05} +{"ts": "2025-12-27T02:23:55", "event": "train_log", "step": 1660, "epoch": 0.70042194092827, "progress_pct": 11.67, "epoch_pct": 11.67, "eta": "62:25:40", "max_grad_norm": 0.8, "loss": 0.7766591310501099, "grad_norm": 1.200772762298584, "learning_rate": 9.960915883607782e-05} +{"ts": "2025-12-27T02:24:16", "event": "train_log", "step": 1662, "epoch": 0.7012658227848101, "progress_pct": 11.69, "epoch_pct": 11.69, "eta": "62:23:06", "max_grad_norm": 0.8, "loss": 0.7433559894561768, "grad_norm": 1.3825311660766602, "learning_rate": 9.960615632978687e-05} +{"ts": "2025-12-27T02:24:35", "event": "train_log", "step": 1664, "epoch": 0.7021097046413503, "progress_pct": 11.7, "epoch_pct": 11.7, "eta": "62:20:25", "max_grad_norm": 0.8, "loss": 0.7770103812217712, "grad_norm": 1.3197243213653564, "learning_rate": 9.960314238030836e-05} +{"ts": "2025-12-27T02:24:54", "event": "train_log", "step": 1666, "epoch": 0.7029535864978903, "progress_pct": 11.72, "epoch_pct": 11.72, "eta": "62:17:48", "max_grad_norm": 0.8, "loss": 0.8597216606140137, "grad_norm": 1.515163779258728, "learning_rate": 9.960011698833755e-05} +{"ts": "2025-12-27T02:25:16", "event": "train_log", "step": 1668, "epoch": 0.7037974683544304, "progress_pct": 11.73, "epoch_pct": 11.73, "eta": "62:15:27", "max_grad_norm": 0.8, "loss": 0.7630532383918762, "grad_norm": 1.2329891920089722, "learning_rate": 9.959708015457234e-05} +{"ts": "2025-12-27T02:25:38", "event": "train_log", "step": 1670, "epoch": 0.7046413502109705, "progress_pct": 11.74, "epoch_pct": 11.74, "eta": "62:13:05", "max_grad_norm": 0.8, "loss": 0.7299806475639343, "grad_norm": 1.0592037439346313, "learning_rate": 9.959403187971327e-05} +{"ts": "2025-12-27T02:25:57", "event": "train_log", "step": 1672, "epoch": 0.7054852320675106, "progress_pct": 11.76, "epoch_pct": 11.76, "eta": "62:10:23", "max_grad_norm": 0.8, "loss": 0.6999854445457458, "grad_norm": 2.2717394828796387, "learning_rate": 9.959097216446351e-05} +{"ts": "2025-12-27T02:26:16", "event": "train_log", "step": 1674, "epoch": 0.7063291139240506, "progress_pct": 11.77, "epoch_pct": 11.77, "eta": "62:07:45", "max_grad_norm": 0.8, "loss": 0.8403060436248779, "grad_norm": 1.1552131175994873, "learning_rate": 9.958790100952889e-05} +{"ts": "2025-12-27T02:26:36", "event": "train_log", "step": 1676, "epoch": 0.7071729957805907, "progress_pct": 11.79, "epoch_pct": 11.79, "eta": "62:05:09", "max_grad_norm": 0.8, "loss": 0.7729134559631348, "grad_norm": 1.290488839149475, "learning_rate": 9.958481841561787e-05} +{"ts": "2025-12-27T02:26:56", "event": "train_log", "step": 1678, "epoch": 0.7080168776371308, "progress_pct": 11.8, "epoch_pct": 11.8, "eta": "62:02:40", "max_grad_norm": 0.8, "loss": 0.7100697755813599, "grad_norm": 1.1913278102874756, "learning_rate": 9.958172438344152e-05} +{"ts": "2025-12-27T02:27:16", "event": "train_log", "step": 1680, "epoch": 0.7088607594936709, "progress_pct": 11.81, "epoch_pct": 11.81, "eta": "62:00:05", "max_grad_norm": 0.8, "loss": 0.7014795541763306, "grad_norm": 1.2355852127075195, "learning_rate": 9.957861891371359e-05} +{"ts": "2025-12-27T02:27:34", "event": "train_log", "step": 1682, "epoch": 0.7097046413502109, "progress_pct": 11.83, "epoch_pct": 11.83, "eta": "61:57:22", "max_grad_norm": 0.8, "loss": 0.8131424784660339, "grad_norm": 1.258705496788025, "learning_rate": 9.957550200715044e-05} +{"ts": "2025-12-27T02:27:55", "event": "train_log", "step": 1684, "epoch": 0.7105485232067511, "progress_pct": 11.84, "epoch_pct": 11.84, "eta": "61:54:57", "max_grad_norm": 0.8, "loss": 0.6842480301856995, "grad_norm": 1.1102997064590454, "learning_rate": 9.957237366447112e-05} +{"ts": "2025-12-27T02:28:15", "event": "train_log", "step": 1686, "epoch": 0.7113924050632912, "progress_pct": 11.86, "epoch_pct": 11.86, "eta": "61:52:29", "max_grad_norm": 0.8, "loss": 0.6730120182037354, "grad_norm": 1.4466290473937988, "learning_rate": 9.956923388639724e-05} +{"ts": "2025-12-27T02:28:35", "event": "train_log", "step": 1688, "epoch": 0.7122362869198312, "progress_pct": 11.87, "epoch_pct": 11.87, "eta": "61:49:56", "max_grad_norm": 0.8, "loss": 0.7109374403953552, "grad_norm": 1.261152982711792, "learning_rate": 9.956608267365311e-05} +{"ts": "2025-12-27T02:28:54", "event": "train_log", "step": 1690, "epoch": 0.7130801687763713, "progress_pct": 11.88, "epoch_pct": 11.88, "eta": "61:47:14", "max_grad_norm": 0.8, "loss": 0.7545008063316345, "grad_norm": 1.4070630073547363, "learning_rate": 9.956292002696562e-05} +{"ts": "2025-12-27T02:29:13", "event": "train_log", "step": 1692, "epoch": 0.7139240506329114, "progress_pct": 11.9, "epoch_pct": 11.9, "eta": "61:44:41", "max_grad_norm": 0.8, "loss": 0.7892587184906006, "grad_norm": 1.2532793283462524, "learning_rate": 9.955974594706436e-05} +{"ts": "2025-12-27T02:29:34", "event": "train_log", "step": 1694, "epoch": 0.7147679324894515, "progress_pct": 11.91, "epoch_pct": 11.91, "eta": "61:42:19", "max_grad_norm": 0.8, "loss": 0.7348554134368896, "grad_norm": 1.1180293560028076, "learning_rate": 9.955656043468153e-05} +{"ts": "2025-12-27T02:29:53", "event": "train_log", "step": 1696, "epoch": 0.7156118143459915, "progress_pct": 11.93, "epoch_pct": 11.93, "eta": "61:39:37", "max_grad_norm": 0.8, "loss": 0.8207674026489258, "grad_norm": 1.333054542541504, "learning_rate": 9.955336349055195e-05} +{"ts": "2025-12-27T02:30:12", "event": "train_log", "step": 1698, "epoch": 0.7164556962025317, "progress_pct": 11.94, "epoch_pct": 11.94, "eta": "61:37:03", "max_grad_norm": 0.8, "loss": 0.7226691842079163, "grad_norm": 1.1373547315597534, "learning_rate": 9.95501551154131e-05} +{"ts": "2025-12-27T02:30:33", "event": "train_log", "step": 1700, "epoch": 0.7172995780590717, "progress_pct": 11.95, "epoch_pct": 11.95, "eta": "61:34:43", "max_grad_norm": 0.8, "loss": 0.726982831954956, "grad_norm": 1.2342052459716797, "learning_rate": 9.95469353100051e-05} +{"ts": "2025-12-27T02:44:39", "event": "train_log", "step": 1700, "epoch": 0.7172995780590717, "progress_pct": 11.95, "epoch_pct": 11.95, "eta": "63:18:35", "max_grad_norm": 0.8, "eval_loss": 0.7783148884773254, "eval_runtime": 846.1986, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49} +{"ts": "2025-12-27T02:44:58", "event": "train_log", "step": 1702, "epoch": 0.7181434599156118, "progress_pct": 11.97, "epoch_pct": 11.97, "eta": "63:15:45", "max_grad_norm": 0.8, "loss": 0.7623077034950256, "grad_norm": 1.3781483173370361, "learning_rate": 9.95437040750707e-05} +{"ts": "2025-12-27T02:45:16", "event": "train_log", "step": 1704, "epoch": 0.7189873417721518, "progress_pct": 11.98, "epoch_pct": 11.98, "eta": "63:12:59", "max_grad_norm": 0.8, "loss": 0.7421616315841675, "grad_norm": 1.301440715789795, "learning_rate": 9.954046141135526e-05} +{"ts": "2025-12-27T02:45:38", "event": "train_log", "step": 1706, "epoch": 0.719831223628692, "progress_pct": 12.0, "epoch_pct": 12.0, "eta": "63:10:35", "max_grad_norm": 0.8, "loss": 0.685523509979248, "grad_norm": 1.1375854015350342, "learning_rate": 9.953720731960683e-05} +{"ts": "2025-12-27T02:45:59", "event": "train_log", "step": 1708, "epoch": 0.7206751054852321, "progress_pct": 12.01, "epoch_pct": 12.01, "eta": "63:08:02", "max_grad_norm": 0.8, "loss": 0.756073534488678, "grad_norm": 1.2014397382736206, "learning_rate": 9.953394180057604e-05} +{"ts": "2025-12-27T02:46:18", "event": "train_log", "step": 1710, "epoch": 0.7215189873417721, "progress_pct": 12.03, "epoch_pct": 12.03, "eta": "63:05:25", "max_grad_norm": 0.8, "loss": 0.7364522814750671, "grad_norm": 1.232802152633667, "learning_rate": 9.95306648550162e-05} +{"ts": "2025-12-27T02:46:38", "event": "train_log", "step": 1712, "epoch": 0.7223628691983123, "progress_pct": 12.04, "epoch_pct": 12.04, "eta": "63:02:49", "max_grad_norm": 0.8, "loss": 0.7073688507080078, "grad_norm": 1.4462472200393677, "learning_rate": 9.952737648368323e-05} +{"ts": "2025-12-27T02:46:59", "event": "train_log", "step": 1714, "epoch": 0.7232067510548523, "progress_pct": 12.05, "epoch_pct": 12.05, "eta": "63:00:19", "max_grad_norm": 0.8, "loss": 0.7147064805030823, "grad_norm": 1.123523473739624, "learning_rate": 9.95240766873357e-05} +{"ts": "2025-12-27T02:47:17", "event": "train_log", "step": 1716, "epoch": 0.7240506329113924, "progress_pct": 12.07, "epoch_pct": 12.07, "eta": "62:57:29", "max_grad_norm": 0.8, "loss": 0.7108398079872131, "grad_norm": 1.4111510515213013, "learning_rate": 9.95207654667348e-05} +{"ts": "2025-12-27T02:47:38", "event": "train_log", "step": 1718, "epoch": 0.7248945147679325, "progress_pct": 12.08, "epoch_pct": 12.08, "eta": "62:55:05", "max_grad_norm": 0.8, "loss": 0.7080079317092896, "grad_norm": 1.2785903215408325, "learning_rate": 9.951744282264437e-05} +{"ts": "2025-12-27T02:47:59", "event": "train_log", "step": 1720, "epoch": 0.7257383966244726, "progress_pct": 12.1, "epoch_pct": 12.1, "eta": "62:52:37", "max_grad_norm": 0.8, "loss": 0.7396624684333801, "grad_norm": 1.1361653804779053, "learning_rate": 9.951410875583089e-05} +{"ts": "2025-12-27T02:48:19", "event": "train_log", "step": 1722, "epoch": 0.7265822784810126, "progress_pct": 12.11, "epoch_pct": 12.11, "eta": "62:50:00", "max_grad_norm": 0.8, "loss": 0.7724334597587585, "grad_norm": 1.0762585401535034, "learning_rate": 9.951076326706346e-05} +{"ts": "2025-12-27T02:48:38", "event": "train_log", "step": 1724, "epoch": 0.7274261603375527, "progress_pct": 12.12, "epoch_pct": 12.12, "eta": "62:47:21", "max_grad_norm": 0.8, "loss": 0.7311923503875732, "grad_norm": 1.3104428052902222, "learning_rate": 9.950740635711379e-05} +{"ts": "2025-12-27T02:49:00", "event": "train_log", "step": 1726, "epoch": 0.7282700421940929, "progress_pct": 12.14, "epoch_pct": 12.14, "eta": "62:45:00", "max_grad_norm": 0.8, "loss": 0.6878296732902527, "grad_norm": 1.1291942596435547, "learning_rate": 9.95040380267563e-05} +{"ts": "2025-12-27T02:49:18", "event": "train_log", "step": 1728, "epoch": 0.7291139240506329, "progress_pct": 12.15, "epoch_pct": 12.15, "eta": "62:42:14", "max_grad_norm": 0.8, "loss": 0.7410538196563721, "grad_norm": 1.5171746015548706, "learning_rate": 9.9500658276768e-05} +{"ts": "2025-12-27T02:49:38", "event": "train_log", "step": 1730, "epoch": 0.729957805907173, "progress_pct": 12.17, "epoch_pct": 12.17, "eta": "62:39:41", "max_grad_norm": 0.8, "loss": 0.6953532695770264, "grad_norm": 1.0966423749923706, "learning_rate": 9.949726710792848e-05} +{"ts": "2025-12-27T02:49:58", "event": "train_log", "step": 1732, "epoch": 0.7308016877637131, "progress_pct": 12.18, "epoch_pct": 12.18, "eta": "62:37:10", "max_grad_norm": 0.8, "loss": 0.6679023504257202, "grad_norm": 1.2436997890472412, "learning_rate": 9.949386452102007e-05} +{"ts": "2025-12-27T02:50:19", "event": "train_log", "step": 1734, "epoch": 0.7316455696202532, "progress_pct": 12.19, "epoch_pct": 12.19, "eta": "62:34:47", "max_grad_norm": 0.8, "loss": 0.8046789765357971, "grad_norm": 1.1364835500717163, "learning_rate": 9.949045051682766e-05} +{"ts": "2025-12-27T02:50:40", "event": "train_log", "step": 1736, "epoch": 0.7324894514767932, "progress_pct": 12.21, "epoch_pct": 12.21, "eta": "62:32:20", "max_grad_norm": 0.8, "loss": 0.7322937846183777, "grad_norm": 1.296648383140564, "learning_rate": 9.948702509613878e-05} +{"ts": "2025-12-27T02:50:59", "event": "train_log", "step": 1738, "epoch": 0.7333333333333333, "progress_pct": 12.22, "epoch_pct": 12.22, "eta": "62:29:39", "max_grad_norm": 0.8, "loss": 0.7442626357078552, "grad_norm": 1.2355525493621826, "learning_rate": 9.948358825974365e-05} +{"ts": "2025-12-27T02:51:20", "event": "train_log", "step": 1740, "epoch": 0.7341772151898734, "progress_pct": 12.24, "epoch_pct": 12.24, "eta": "62:27:19", "max_grad_norm": 0.8, "loss": 0.7231078743934631, "grad_norm": 1.1634451150894165, "learning_rate": 9.948014000843504e-05} +{"ts": "2025-12-27T02:51:41", "event": "train_log", "step": 1742, "epoch": 0.7350210970464135, "progress_pct": 12.25, "epoch_pct": 12.25, "eta": "62:24:51", "max_grad_norm": 0.8, "loss": 0.6436833143234253, "grad_norm": 1.1500129699707031, "learning_rate": 9.947668034300843e-05} +{"ts": "2025-12-27T02:52:00", "event": "train_log", "step": 1744, "epoch": 0.7358649789029535, "progress_pct": 12.26, "epoch_pct": 12.26, "eta": "62:22:13", "max_grad_norm": 0.8, "loss": 0.8170580863952637, "grad_norm": 1.3881278038024902, "learning_rate": 9.947320926426189e-05} +{"ts": "2025-12-27T02:52:17", "event": "train_log", "step": 1746, "epoch": 0.7367088607594937, "progress_pct": 12.28, "epoch_pct": 12.28, "eta": "62:19:27", "max_grad_norm": 0.8, "loss": 0.7830947041511536, "grad_norm": 1.3479492664337158, "learning_rate": 9.94697267729961e-05} +{"ts": "2025-12-27T02:52:39", "event": "train_log", "step": 1748, "epoch": 0.7375527426160338, "progress_pct": 12.29, "epoch_pct": 12.29, "eta": "62:17:06", "max_grad_norm": 0.8, "loss": 0.7358533143997192, "grad_norm": 1.0187158584594727, "learning_rate": 9.946623287001444e-05} +{"ts": "2025-12-27T02:52:57", "event": "train_log", "step": 1750, "epoch": 0.7383966244725738, "progress_pct": 12.31, "epoch_pct": 12.31, "eta": "62:14:28", "max_grad_norm": 0.8, "loss": 0.7279790639877319, "grad_norm": 1.2575689554214478, "learning_rate": 9.946272755612287e-05} +{"ts": "2025-12-27T02:53:18", "event": "train_log", "step": 1752, "epoch": 0.739240506329114, "progress_pct": 12.32, "epoch_pct": 12.32, "eta": "62:12:05", "max_grad_norm": 0.8, "loss": 0.6953092217445374, "grad_norm": 1.2045027017593384, "learning_rate": 9.945921083213002e-05} +{"ts": "2025-12-27T02:53:37", "event": "train_log", "step": 1754, "epoch": 0.740084388185654, "progress_pct": 12.33, "epoch_pct": 12.33, "eta": "62:09:23", "max_grad_norm": 0.8, "loss": 0.8094141483306885, "grad_norm": 1.3994466066360474, "learning_rate": 9.945568269884708e-05} +{"ts": "2025-12-27T02:53:58", "event": "train_log", "step": 1756, "epoch": 0.7409282700421941, "progress_pct": 12.35, "epoch_pct": 12.35, "eta": "62:07:01", "max_grad_norm": 0.8, "loss": 0.6979201436042786, "grad_norm": 1.2892286777496338, "learning_rate": 9.945214315708797e-05} +{"ts": "2025-12-27T02:54:17", "event": "train_log", "step": 1758, "epoch": 0.7417721518987341, "progress_pct": 12.36, "epoch_pct": 12.36, "eta": "62:04:31", "max_grad_norm": 0.8, "loss": 0.6810774803161621, "grad_norm": 1.2006971836090088, "learning_rate": 9.944859220766919e-05} +{"ts": "2025-12-27T02:54:37", "event": "train_log", "step": 1760, "epoch": 0.7426160337552743, "progress_pct": 12.38, "epoch_pct": 12.38, "eta": "62:02:02", "max_grad_norm": 0.8, "loss": 0.6796762347221375, "grad_norm": 1.055793285369873, "learning_rate": 9.944502985140986e-05} +{"ts": "2025-12-27T02:54:57", "event": "train_log", "step": 1762, "epoch": 0.7434599156118143, "progress_pct": 12.39, "epoch_pct": 12.39, "eta": "61:59:32", "max_grad_norm": 0.8, "loss": 0.7954121828079224, "grad_norm": 1.174714207649231, "learning_rate": 9.944145608913175e-05} +{"ts": "2025-12-27T02:55:17", "event": "train_log", "step": 1764, "epoch": 0.7443037974683544, "progress_pct": 12.41, "epoch_pct": 12.41, "eta": "61:57:05", "max_grad_norm": 0.8, "loss": 0.6939491629600525, "grad_norm": 1.1638222932815552, "learning_rate": 9.943787092165926e-05} +{"ts": "2025-12-27T02:55:39", "event": "train_log", "step": 1766, "epoch": 0.7451476793248946, "progress_pct": 12.42, "epoch_pct": 12.42, "eta": "61:54:51", "max_grad_norm": 0.8, "loss": 0.8112956285476685, "grad_norm": 1.1861820220947266, "learning_rate": 9.943427434981942e-05} +{"ts": "2025-12-27T02:56:01", "event": "train_log", "step": 1768, "epoch": 0.7459915611814346, "progress_pct": 12.43, "epoch_pct": 12.43, "eta": "61:52:36", "max_grad_norm": 0.8, "loss": 0.6812481880187988, "grad_norm": 0.9667421579360962, "learning_rate": 9.943066637444189e-05} +{"ts": "2025-12-27T02:56:19", "event": "train_log", "step": 1770, "epoch": 0.7468354430379747, "progress_pct": 12.45, "epoch_pct": 12.45, "eta": "61:50:01", "max_grad_norm": 0.8, "loss": 0.7598370313644409, "grad_norm": 1.2826191186904907, "learning_rate": 9.942704699635898e-05} +{"ts": "2025-12-27T02:56:39", "event": "train_log", "step": 1772, "epoch": 0.7476793248945147, "progress_pct": 12.46, "epoch_pct": 12.46, "eta": "61:47:31", "max_grad_norm": 0.8, "loss": 0.7118877172470093, "grad_norm": 1.2257909774780273, "learning_rate": 9.942341621640558e-05} +{"ts": "2025-12-27T02:56:58", "event": "train_log", "step": 1774, "epoch": 0.7485232067510549, "progress_pct": 12.48, "epoch_pct": 12.48, "eta": "61:44:56", "max_grad_norm": 0.8, "loss": 0.8037024736404419, "grad_norm": 1.5224615335464478, "learning_rate": 9.941977403541925e-05} +{"ts": "2025-12-27T02:57:18", "event": "train_log", "step": 1776, "epoch": 0.7493670886075949, "progress_pct": 12.49, "epoch_pct": 12.49, "eta": "61:42:31", "max_grad_norm": 0.8, "loss": 0.6795828938484192, "grad_norm": 1.188689947128296, "learning_rate": 9.941612045424018e-05} +{"ts": "2025-12-27T02:57:36", "event": "train_log", "step": 1778, "epoch": 0.750210970464135, "progress_pct": 12.5, "epoch_pct": 12.5, "eta": "61:39:52", "max_grad_norm": 0.8, "loss": 0.6934568881988525, "grad_norm": 1.0685369968414307, "learning_rate": 9.941245547371116e-05} +{"ts": "2025-12-27T02:57:57", "event": "train_log", "step": 1780, "epoch": 0.7510548523206751, "progress_pct": 12.52, "epoch_pct": 12.52, "eta": "61:37:32", "max_grad_norm": 0.8, "loss": 0.6883851289749146, "grad_norm": 1.1643654108047485, "learning_rate": 9.940877909467767e-05} +{"ts": "2025-12-27T02:58:15", "event": "train_log", "step": 1782, "epoch": 0.7518987341772152, "progress_pct": 12.53, "epoch_pct": 12.53, "eta": "61:34:56", "max_grad_norm": 0.8, "loss": 0.8284637928009033, "grad_norm": 1.15621018409729, "learning_rate": 9.940509131798775e-05} +{"ts": "2025-12-27T02:58:33", "event": "train_log", "step": 1784, "epoch": 0.7527426160337553, "progress_pct": 12.55, "epoch_pct": 12.55, "eta": "61:32:17", "max_grad_norm": 0.8, "loss": 0.7108310461044312, "grad_norm": 1.1946302652359009, "learning_rate": 9.94013921444921e-05} +{"ts": "2025-12-27T02:58:53", "event": "train_log", "step": 1786, "epoch": 0.7535864978902953, "progress_pct": 12.56, "epoch_pct": 12.56, "eta": "61:29:49", "max_grad_norm": 0.8, "loss": 0.7166154384613037, "grad_norm": 1.1536555290222168, "learning_rate": 9.939768157504404e-05} +{"ts": "2025-12-27T02:59:11", "event": "train_log", "step": 1788, "epoch": 0.7544303797468355, "progress_pct": 12.57, "epoch_pct": 12.57, "eta": "61:27:17", "max_grad_norm": 0.8, "loss": 0.7774572372436523, "grad_norm": 1.3184611797332764, "learning_rate": 9.939395961049956e-05} +{"ts": "2025-12-27T02:59:31", "event": "train_log", "step": 1790, "epoch": 0.7552742616033755, "progress_pct": 12.59, "epoch_pct": 12.59, "eta": "61:24:53", "max_grad_norm": 0.8, "loss": 0.7386471033096313, "grad_norm": 1.0782374143600464, "learning_rate": 9.939022625171723e-05} +{"ts": "2025-12-27T02:59:51", "event": "train_log", "step": 1792, "epoch": 0.7561181434599156, "progress_pct": 12.6, "epoch_pct": 12.6, "eta": "61:22:27", "max_grad_norm": 0.8, "loss": 0.6495215892791748, "grad_norm": 1.1616696119308472, "learning_rate": 9.938648149955824e-05} +{"ts": "2025-12-27T03:00:11", "event": "train_log", "step": 1794, "epoch": 0.7569620253164557, "progress_pct": 12.62, "epoch_pct": 12.62, "eta": "61:20:01", "max_grad_norm": 0.8, "loss": 0.7733646631240845, "grad_norm": 1.1715892553329468, "learning_rate": 9.938272535488647e-05} +{"ts": "2025-12-27T03:00:30", "event": "train_log", "step": 1796, "epoch": 0.7578059071729958, "progress_pct": 12.63, "epoch_pct": 12.63, "eta": "61:17:35", "max_grad_norm": 0.8, "loss": 0.7354782223701477, "grad_norm": 1.203466773033142, "learning_rate": 9.937895781856838e-05} +{"ts": "2025-12-27T03:00:48", "event": "train_log", "step": 1798, "epoch": 0.7586497890295358, "progress_pct": 12.64, "epoch_pct": 12.64, "eta": "61:14:59", "max_grad_norm": 0.8, "loss": 0.823226273059845, "grad_norm": 1.246559977531433, "learning_rate": 9.937517889147305e-05} +{"ts": "2025-12-27T03:01:09", "event": "train_log", "step": 1800, "epoch": 0.759493670886076, "progress_pct": 12.66, "epoch_pct": 12.66, "eta": "61:12:40", "max_grad_norm": 0.8, "loss": 0.6221681833267212, "grad_norm": 0.9968833923339844, "learning_rate": 9.937138857447221e-05} +{"ts": "2025-12-27T03:15:22", "event": "train_log", "step": 1800, "epoch": 0.759493670886076, "progress_pct": 12.66, "epoch_pct": 12.66, "eta": "62:50:47", "max_grad_norm": 0.8, "eval_loss": 0.7719914317131042, "eval_runtime": 853.1943, "eval_samples_per_second": 2.47, "eval_steps_per_second": 2.47} +{"ts": "2025-12-27T03:15:41", "event": "train_log", "step": 1802, "epoch": 0.760337552742616, "progress_pct": 12.67, "epoch_pct": 12.67, "eta": "62:48:11", "max_grad_norm": 0.8, "loss": 0.7799059152603149, "grad_norm": 1.5454338788986206, "learning_rate": 9.936758686844024e-05} +{"ts": "2025-12-27T03:16:01", "event": "train_log", "step": 1804, "epoch": 0.7611814345991561, "progress_pct": 12.69, "epoch_pct": 12.69, "eta": "62:45:43", "max_grad_norm": 0.8, "loss": 0.653838038444519, "grad_norm": 1.1954455375671387, "learning_rate": 9.936377377425409e-05} +{"ts": "2025-12-27T03:16:21", "event": "train_log", "step": 1806, "epoch": 0.7620253164556962, "progress_pct": 12.7, "epoch_pct": 12.7, "eta": "62:43:13", "max_grad_norm": 0.8, "loss": 0.7046942710876465, "grad_norm": 1.2538350820541382, "learning_rate": 9.935994929279339e-05} +{"ts": "2025-12-27T03:16:41", "event": "train_log", "step": 1808, "epoch": 0.7628691983122363, "progress_pct": 12.71, "epoch_pct": 12.71, "eta": "62:40:42", "max_grad_norm": 0.8, "loss": 0.7821131348609924, "grad_norm": 1.2358729839324951, "learning_rate": 9.935611342494035e-05} +{"ts": "2025-12-27T03:16:59", "event": "train_log", "step": 1810, "epoch": 0.7637130801687764, "progress_pct": 12.73, "epoch_pct": 12.73, "eta": "62:38:05", "max_grad_norm": 0.8, "loss": 0.7594596147537231, "grad_norm": 1.2401310205459595, "learning_rate": 9.935226617157986e-05} +{"ts": "2025-12-27T03:17:19", "event": "train_log", "step": 1812, "epoch": 0.7645569620253164, "progress_pct": 12.74, "epoch_pct": 12.74, "eta": "62:35:32", "max_grad_norm": 0.8, "loss": 0.7512493133544922, "grad_norm": 1.3197205066680908, "learning_rate": 9.934840753359938e-05} +{"ts": "2025-12-27T03:17:38", "event": "train_log", "step": 1814, "epoch": 0.7654008438818566, "progress_pct": 12.76, "epoch_pct": 12.76, "eta": "62:33:00", "max_grad_norm": 0.8, "loss": 0.6953311562538147, "grad_norm": 1.2482305765151978, "learning_rate": 9.934453751188903e-05} +{"ts": "2025-12-27T03:17:57", "event": "train_log", "step": 1816, "epoch": 0.7662447257383966, "progress_pct": 12.77, "epoch_pct": 12.77, "eta": "62:30:27", "max_grad_norm": 0.8, "loss": 0.7699819803237915, "grad_norm": 1.5995157957077026, "learning_rate": 9.934065610734157e-05} +{"ts": "2025-12-27T03:18:18", "event": "train_log", "step": 1818, "epoch": 0.7670886075949367, "progress_pct": 12.78, "epoch_pct": 12.78, "eta": "62:28:02", "max_grad_norm": 0.8, "loss": 0.6532001495361328, "grad_norm": 1.2414922714233398, "learning_rate": 9.933676332085235e-05} +{"ts": "2025-12-27T03:18:38", "event": "train_log", "step": 1820, "epoch": 0.7679324894514767, "progress_pct": 12.8, "epoch_pct": 12.8, "eta": "62:25:36", "max_grad_norm": 0.8, "loss": 0.7716373801231384, "grad_norm": 1.2274713516235352, "learning_rate": 9.933285915331937e-05} +{"ts": "2025-12-27T03:18:58", "event": "train_log", "step": 1822, "epoch": 0.7687763713080169, "progress_pct": 12.81, "epoch_pct": 12.81, "eta": "62:23:07", "max_grad_norm": 0.8, "loss": 0.7002654671669006, "grad_norm": 1.2894618511199951, "learning_rate": 9.932894360564322e-05} +{"ts": "2025-12-27T03:19:18", "event": "train_log", "step": 1824, "epoch": 0.769620253164557, "progress_pct": 12.83, "epoch_pct": 12.83, "eta": "62:20:46", "max_grad_norm": 0.8, "loss": 0.7970587015151978, "grad_norm": 1.10796320438385, "learning_rate": 9.932501667872718e-05} +{"ts": "2025-12-27T03:19:37", "event": "train_log", "step": 1826, "epoch": 0.770464135021097, "progress_pct": 12.84, "epoch_pct": 12.84, "eta": "62:18:08", "max_grad_norm": 0.8, "loss": 0.8071644306182861, "grad_norm": 1.2393653392791748, "learning_rate": 9.932107837347708e-05} +{"ts": "2025-12-27T03:19:58", "event": "train_log", "step": 1828, "epoch": 0.7713080168776372, "progress_pct": 12.86, "epoch_pct": 12.86, "eta": "62:15:49", "max_grad_norm": 0.8, "loss": 0.7376157641410828, "grad_norm": 1.1999030113220215, "learning_rate": 9.931712869080144e-05} +{"ts": "2025-12-27T03:20:16", "event": "train_log", "step": 1830, "epoch": 0.7721518987341772, "progress_pct": 12.87, "epoch_pct": 12.87, "eta": "62:13:13", "max_grad_norm": 0.8, "loss": 0.7487053275108337, "grad_norm": 1.1166026592254639, "learning_rate": 9.931316763161135e-05} +{"ts": "2025-12-27T03:20:36", "event": "train_log", "step": 1832, "epoch": 0.7729957805907173, "progress_pct": 12.88, "epoch_pct": 12.88, "eta": "62:10:47", "max_grad_norm": 0.8, "loss": 0.733161985874176, "grad_norm": 1.1788052320480347, "learning_rate": 9.930919519682059e-05} +{"ts": "2025-12-27T03:20:54", "event": "train_log", "step": 1834, "epoch": 0.7738396624472574, "progress_pct": 12.9, "epoch_pct": 12.9, "eta": "62:08:09", "max_grad_norm": 0.8, "loss": 0.7907692790031433, "grad_norm": 1.309968113899231, "learning_rate": 9.930521138734548e-05} +{"ts": "2025-12-27T03:21:14", "event": "train_log", "step": 1836, "epoch": 0.7746835443037975, "progress_pct": 12.91, "epoch_pct": 12.91, "eta": "62:05:44", "max_grad_norm": 0.8, "loss": 0.7192210555076599, "grad_norm": 1.1685889959335327, "learning_rate": 9.930121620410502e-05} +{"ts": "2025-12-27T03:21:35", "event": "train_log", "step": 1838, "epoch": 0.7755274261603375, "progress_pct": 12.93, "epoch_pct": 12.93, "eta": "62:03:22", "max_grad_norm": 0.8, "loss": 0.7394438982009888, "grad_norm": 1.2243701219558716, "learning_rate": 9.929720964802085e-05} +{"ts": "2025-12-27T03:21:54", "event": "train_log", "step": 1840, "epoch": 0.7763713080168776, "progress_pct": 12.94, "epoch_pct": 12.94, "eta": "62:00:52", "max_grad_norm": 0.8, "loss": 0.7885041832923889, "grad_norm": 1.2940958738327026, "learning_rate": 9.929319172001717e-05} +{"ts": "2025-12-27T03:22:13", "event": "train_log", "step": 1842, "epoch": 0.7772151898734178, "progress_pct": 12.95, "epoch_pct": 12.95, "eta": "61:58:21", "max_grad_norm": 0.8, "loss": 0.6822885274887085, "grad_norm": 1.0952763557434082, "learning_rate": 9.928916242102086e-05} +{"ts": "2025-12-27T03:22:32", "event": "train_log", "step": 1844, "epoch": 0.7780590717299578, "progress_pct": 12.97, "epoch_pct": 12.97, "eta": "61:55:52", "max_grad_norm": 0.8, "loss": 0.7070927619934082, "grad_norm": 1.0333503484725952, "learning_rate": 9.928512175196139e-05} +{"ts": "2025-12-27T03:22:52", "event": "train_log", "step": 1846, "epoch": 0.7789029535864979, "progress_pct": 12.98, "epoch_pct": 12.98, "eta": "61:53:27", "max_grad_norm": 0.8, "loss": 0.7041296362876892, "grad_norm": 1.201359510421753, "learning_rate": 9.928106971377088e-05} +{"ts": "2025-12-27T03:23:12", "event": "train_log", "step": 1848, "epoch": 0.779746835443038, "progress_pct": 13.0, "epoch_pct": 13.0, "eta": "61:51:04", "max_grad_norm": 0.8, "loss": 0.6630192995071411, "grad_norm": 1.5381278991699219, "learning_rate": 9.927700630738404e-05} +{"ts": "2025-12-27T03:23:31", "event": "train_log", "step": 1850, "epoch": 0.7805907172995781, "progress_pct": 13.01, "epoch_pct": 13.01, "eta": "61:48:36", "max_grad_norm": 0.8, "loss": 0.7628101110458374, "grad_norm": 1.2858322858810425, "learning_rate": 9.927293153373823e-05} +{"ts": "2025-12-27T03:23:50", "event": "train_log", "step": 1852, "epoch": 0.7814345991561181, "progress_pct": 13.02, "epoch_pct": 13.02, "eta": "61:46:04", "max_grad_norm": 0.8, "loss": 0.7557390928268433, "grad_norm": 1.3730580806732178, "learning_rate": 9.926884539377343e-05} +{"ts": "2025-12-27T03:24:08", "event": "train_log", "step": 1854, "epoch": 0.7822784810126582, "progress_pct": 13.04, "epoch_pct": 13.04, "eta": "61:43:33", "max_grad_norm": 0.8, "loss": 0.8217329978942871, "grad_norm": 1.4954931735992432, "learning_rate": 9.92647478884322e-05} +{"ts": "2025-12-27T03:24:29", "event": "train_log", "step": 1856, "epoch": 0.7831223628691983, "progress_pct": 13.05, "epoch_pct": 13.05, "eta": "61:41:13", "max_grad_norm": 0.8, "loss": 0.672879695892334, "grad_norm": 1.1092652082443237, "learning_rate": 9.92606390186598e-05} +{"ts": "2025-12-27T03:24:47", "event": "train_log", "step": 1858, "epoch": 0.7839662447257384, "progress_pct": 13.07, "epoch_pct": 13.07, "eta": "61:38:42", "max_grad_norm": 0.8, "loss": 0.7380653619766235, "grad_norm": 1.2077893018722534, "learning_rate": 9.925651878540404e-05} +{"ts": "2025-12-27T03:25:07", "event": "train_log", "step": 1860, "epoch": 0.7848101265822784, "progress_pct": 13.08, "epoch_pct": 13.08, "eta": "61:36:19", "max_grad_norm": 0.8, "loss": 0.6648160219192505, "grad_norm": 1.0789313316345215, "learning_rate": 9.925238718961538e-05} +{"ts": "2025-12-27T03:25:24", "event": "train_log", "step": 1862, "epoch": 0.7856540084388186, "progress_pct": 13.09, "epoch_pct": 13.09, "eta": "61:33:41", "max_grad_norm": 0.8, "loss": 0.8316769003868103, "grad_norm": 1.3950812816619873, "learning_rate": 9.924824423224692e-05} +{"ts": "2025-12-27T03:25:43", "event": "train_log", "step": 1864, "epoch": 0.7864978902953587, "progress_pct": 13.11, "epoch_pct": 13.11, "eta": "61:31:08", "max_grad_norm": 0.8, "loss": 0.7901778817176819, "grad_norm": 1.3934763669967651, "learning_rate": 9.924408991425433e-05} +{"ts": "2025-12-27T03:26:03", "event": "train_log", "step": 1866, "epoch": 0.7873417721518987, "progress_pct": 13.12, "epoch_pct": 13.12, "eta": "61:28:46", "max_grad_norm": 0.8, "loss": 0.7643826007843018, "grad_norm": 1.2191659212112427, "learning_rate": 9.923992423659596e-05} +{"ts": "2025-12-27T03:26:23", "event": "train_log", "step": 1868, "epoch": 0.7881856540084389, "progress_pct": 13.14, "epoch_pct": 13.14, "eta": "61:26:28", "max_grad_norm": 0.8, "loss": 0.6314064860343933, "grad_norm": 0.986673891544342, "learning_rate": 9.923574720023274e-05} +{"ts": "2025-12-27T03:26:43", "event": "train_log", "step": 1870, "epoch": 0.7890295358649789, "progress_pct": 13.15, "epoch_pct": 13.15, "eta": "61:24:05", "max_grad_norm": 0.8, "loss": 0.8244763016700745, "grad_norm": 1.003552794456482, "learning_rate": 9.923155880612823e-05} +{"ts": "2025-12-27T03:27:03", "event": "train_log", "step": 1872, "epoch": 0.789873417721519, "progress_pct": 13.16, "epoch_pct": 13.16, "eta": "61:21:47", "max_grad_norm": 0.8, "loss": 0.7398403882980347, "grad_norm": 1.0831382274627686, "learning_rate": 9.92273590552486e-05} +{"ts": "2025-12-27T03:27:22", "event": "train_log", "step": 1874, "epoch": 0.790717299578059, "progress_pct": 13.18, "epoch_pct": 13.18, "eta": "61:19:22", "max_grad_norm": 0.8, "loss": 0.735211968421936, "grad_norm": 1.1782667636871338, "learning_rate": 9.922314794856267e-05} +{"ts": "2025-12-27T03:27:41", "event": "train_log", "step": 1876, "epoch": 0.7915611814345992, "progress_pct": 13.19, "epoch_pct": 13.19, "eta": "61:16:54", "max_grad_norm": 0.8, "loss": 0.7550510764122009, "grad_norm": 2.230534076690674, "learning_rate": 9.921892548704186e-05} +{"ts": "2025-12-27T03:28:01", "event": "train_log", "step": 1878, "epoch": 0.7924050632911392, "progress_pct": 13.21, "epoch_pct": 13.21, "eta": "61:14:34", "max_grad_norm": 0.8, "loss": 0.7676286697387695, "grad_norm": 1.0191401243209839, "learning_rate": 9.92146916716602e-05} +{"ts": "2025-12-27T03:28:20", "event": "train_log", "step": 1880, "epoch": 0.7932489451476793, "progress_pct": 13.22, "epoch_pct": 13.22, "eta": "61:12:13", "max_grad_norm": 0.8, "loss": 0.7409467697143555, "grad_norm": 1.1347072124481201, "learning_rate": 9.921044650339438e-05} +{"ts": "2025-12-27T03:28:39", "event": "train_log", "step": 1882, "epoch": 0.7940928270042195, "progress_pct": 13.23, "epoch_pct": 13.23, "eta": "61:09:48", "max_grad_norm": 0.8, "loss": 0.7760165333747864, "grad_norm": 1.107528567314148, "learning_rate": 9.920618998322364e-05} +{"ts": "2025-12-27T03:29:00", "event": "train_log", "step": 1884, "epoch": 0.7949367088607595, "progress_pct": 13.25, "epoch_pct": 13.25, "eta": "61:07:32", "max_grad_norm": 0.8, "loss": 0.7360131740570068, "grad_norm": 1.1110666990280151, "learning_rate": 9.92019221121299e-05} +{"ts": "2025-12-27T03:29:18", "event": "train_log", "step": 1886, "epoch": 0.7957805907172996, "progress_pct": 13.26, "epoch_pct": 13.26, "eta": "61:05:06", "max_grad_norm": 0.8, "loss": 0.7784845232963562, "grad_norm": 1.267580509185791, "learning_rate": 9.919764289109765e-05} +{"ts": "2025-12-27T03:29:38", "event": "train_log", "step": 1888, "epoch": 0.7966244725738396, "progress_pct": 13.28, "epoch_pct": 13.28, "eta": "61:02:43", "max_grad_norm": 0.8, "loss": 0.7880831360816956, "grad_norm": 1.5894557237625122, "learning_rate": 9.919335232111407e-05} +{"ts": "2025-12-27T03:29:57", "event": "train_log", "step": 1890, "epoch": 0.7974683544303798, "progress_pct": 13.29, "epoch_pct": 13.29, "eta": "61:00:22", "max_grad_norm": 0.8, "loss": 0.7315587997436523, "grad_norm": 1.1906384229660034, "learning_rate": 9.918905040316886e-05} +{"ts": "2025-12-27T03:30:15", "event": "train_log", "step": 1892, "epoch": 0.7983122362869198, "progress_pct": 13.31, "epoch_pct": 13.31, "eta": "60:57:51", "max_grad_norm": 0.8, "loss": 0.7808622121810913, "grad_norm": 1.3626811504364014, "learning_rate": 9.918473713825445e-05} +{"ts": "2025-12-27T03:30:35", "event": "train_log", "step": 1894, "epoch": 0.7991561181434599, "progress_pct": 13.32, "epoch_pct": 13.32, "eta": "60:55:31", "max_grad_norm": 0.8, "loss": 0.7055642604827881, "grad_norm": 1.1801300048828125, "learning_rate": 9.918041252736577e-05} +{"ts": "2025-12-27T03:30:54", "event": "train_log", "step": 1896, "epoch": 0.8, "progress_pct": 13.33, "epoch_pct": 13.33, "eta": "60:53:10", "max_grad_norm": 0.8, "loss": 0.7188893556594849, "grad_norm": 1.2669063806533813, "learning_rate": 9.917607657150046e-05} +{"ts": "2025-12-27T03:31:13", "event": "train_log", "step": 1898, "epoch": 0.8008438818565401, "progress_pct": 13.35, "epoch_pct": 13.35, "eta": "60:50:44", "max_grad_norm": 0.8, "loss": 0.7787454128265381, "grad_norm": 1.1746855974197388, "learning_rate": 9.91717292716587e-05} +{"ts": "2025-12-27T03:31:33", "event": "train_log", "step": 1900, "epoch": 0.8016877637130801, "progress_pct": 13.36, "epoch_pct": 13.36, "eta": "60:48:30", "max_grad_norm": 0.8, "loss": 0.720715343952179, "grad_norm": 1.120012640953064, "learning_rate": 9.916737062884338e-05} +{"ts": "2025-12-27T03:45:59", "event": "train_log", "step": 1900, "epoch": 0.8016877637130801, "progress_pct": 13.36, "epoch_pct": 13.36, "eta": "62:22:05", "max_grad_norm": 0.8, "eval_loss": 0.7648926973342896, "eval_runtime": 865.9394, "eval_samples_per_second": 2.433, "eval_steps_per_second": 2.433} +{"ts": "2025-12-27T03:46:18", "event": "train_log", "step": 1902, "epoch": 0.8025316455696202, "progress_pct": 13.38, "epoch_pct": 13.38, "eta": "62:19:37", "max_grad_norm": 0.8, "loss": 0.7544789910316467, "grad_norm": 1.1745549440383911, "learning_rate": 9.916300064405993e-05} +{"ts": "2025-12-27T03:46:37", "event": "train_log", "step": 1904, "epoch": 0.8033755274261604, "progress_pct": 13.39, "epoch_pct": 13.39, "eta": "62:17:10", "max_grad_norm": 0.8, "loss": 0.7479203343391418, "grad_norm": 1.1439874172210693, "learning_rate": 9.915861931831643e-05} +{"ts": "2025-12-27T03:46:56", "event": "train_log", "step": 1906, "epoch": 0.8042194092827004, "progress_pct": 13.4, "epoch_pct": 13.4, "eta": "62:14:38", "max_grad_norm": 0.8, "loss": 0.6995842456817627, "grad_norm": 1.3508219718933105, "learning_rate": 9.915422665262356e-05} +{"ts": "2025-12-27T03:47:15", "event": "train_log", "step": 1908, "epoch": 0.8050632911392405, "progress_pct": 13.42, "epoch_pct": 13.42, "eta": "62:12:12", "max_grad_norm": 0.8, "loss": 0.7152725458145142, "grad_norm": 1.1519006490707397, "learning_rate": 9.914982264799462e-05} +{"ts": "2025-12-27T03:47:35", "event": "train_log", "step": 1910, "epoch": 0.8059071729957806, "progress_pct": 13.43, "epoch_pct": 13.43, "eta": "62:09:48", "max_grad_norm": 0.8, "loss": 0.7105516195297241, "grad_norm": 1.0818005800247192, "learning_rate": 9.914540730544554e-05} +{"ts": "2025-12-27T03:47:55", "event": "train_log", "step": 1912, "epoch": 0.8067510548523207, "progress_pct": 13.45, "epoch_pct": 13.45, "eta": "62:07:25", "max_grad_norm": 0.8, "loss": 0.6911059617996216, "grad_norm": 1.1611127853393555, "learning_rate": 9.914098062599485e-05} +{"ts": "2025-12-27T03:48:15", "event": "train_log", "step": 1914, "epoch": 0.8075949367088607, "progress_pct": 13.46, "epoch_pct": 13.46, "eta": "62:05:02", "max_grad_norm": 0.8, "loss": 0.6897286772727966, "grad_norm": 1.1964445114135742, "learning_rate": 9.91365426106637e-05} +{"ts": "2025-12-27T03:48:33", "event": "train_log", "step": 1916, "epoch": 0.8084388185654009, "progress_pct": 13.47, "epoch_pct": 13.47, "eta": "62:02:28", "max_grad_norm": 0.8, "loss": 0.7263250350952148, "grad_norm": 1.3873497247695923, "learning_rate": 9.913209326047585e-05} +{"ts": "2025-12-27T03:48:52", "event": "train_log", "step": 1918, "epoch": 0.809282700421941, "progress_pct": 13.49, "epoch_pct": 13.49, "eta": "62:00:05", "max_grad_norm": 0.8, "loss": 0.7045295238494873, "grad_norm": 1.1729894876480103, "learning_rate": 9.91276325764577e-05} +{"ts": "2025-12-27T03:49:13", "event": "train_log", "step": 1920, "epoch": 0.810126582278481, "progress_pct": 13.5, "epoch_pct": 13.5, "eta": "61:57:51", "max_grad_norm": 0.8, "loss": 0.587131142616272, "grad_norm": 0.9089694619178772, "learning_rate": 9.912316055963822e-05} +{"ts": "2025-12-27T03:49:32", "event": "train_log", "step": 1922, "epoch": 0.810970464135021, "progress_pct": 13.52, "epoch_pct": 13.52, "eta": "61:55:20", "max_grad_norm": 0.8, "loss": 0.7237880229949951, "grad_norm": 1.2051384449005127, "learning_rate": 9.911867721104902e-05} +{"ts": "2025-12-27T03:49:51", "event": "train_log", "step": 1924, "epoch": 0.8118143459915612, "progress_pct": 13.53, "epoch_pct": 13.53, "eta": "61:52:54", "max_grad_norm": 0.8, "loss": 0.6967294216156006, "grad_norm": 1.2152670621871948, "learning_rate": 9.911418253172433e-05} +{"ts": "2025-12-27T03:50:10", "event": "train_log", "step": 1926, "epoch": 0.8126582278481013, "progress_pct": 13.54, "epoch_pct": 13.54, "eta": "61:50:29", "max_grad_norm": 0.8, "loss": 0.7636315822601318, "grad_norm": 1.1193642616271973, "learning_rate": 9.9109676522701e-05} +{"ts": "2025-12-27T03:50:29", "event": "train_log", "step": 1928, "epoch": 0.8135021097046413, "progress_pct": 13.56, "epoch_pct": 13.56, "eta": "61:48:03", "max_grad_norm": 0.8, "loss": 0.7451969981193542, "grad_norm": 1.2457597255706787, "learning_rate": 9.910515918501843e-05} +{"ts": "2025-12-27T03:50:50", "event": "train_log", "step": 1930, "epoch": 0.8143459915611815, "progress_pct": 13.57, "epoch_pct": 13.57, "eta": "61:45:53", "max_grad_norm": 0.8, "loss": 0.6320056319236755, "grad_norm": 1.057009220123291, "learning_rate": 9.910063051971876e-05} +{"ts": "2025-12-27T03:51:09", "event": "train_log", "step": 1932, "epoch": 0.8151898734177215, "progress_pct": 13.59, "epoch_pct": 13.59, "eta": "61:43:26", "max_grad_norm": 0.8, "loss": 0.691004753112793, "grad_norm": 1.2820258140563965, "learning_rate": 9.909609052784661e-05} +{"ts": "2025-12-27T03:51:29", "event": "train_log", "step": 1934, "epoch": 0.8160337552742616, "progress_pct": 13.6, "epoch_pct": 13.6, "eta": "61:41:07", "max_grad_norm": 0.8, "loss": 0.7741923332214355, "grad_norm": 1.331312656402588, "learning_rate": 9.909153921044927e-05} +{"ts": "2025-12-27T03:51:49", "event": "train_log", "step": 1936, "epoch": 0.8168776371308016, "progress_pct": 13.61, "epoch_pct": 13.61, "eta": "61:38:47", "max_grad_norm": 0.8, "loss": 0.668049156665802, "grad_norm": 1.2055360078811646, "learning_rate": 9.908697656857668e-05} +{"ts": "2025-12-27T03:52:08", "event": "train_log", "step": 1938, "epoch": 0.8177215189873418, "progress_pct": 13.63, "epoch_pct": 13.63, "eta": "61:36:24", "max_grad_norm": 0.8, "loss": 0.6584748029708862, "grad_norm": 1.2124541997909546, "learning_rate": 9.90824026032813e-05} +{"ts": "2025-12-27T03:52:28", "event": "train_log", "step": 1940, "epoch": 0.8185654008438819, "progress_pct": 13.64, "epoch_pct": 13.64, "eta": "61:34:02", "max_grad_norm": 0.8, "loss": 0.7081992626190186, "grad_norm": 1.244288682937622, "learning_rate": 9.90778173156183e-05} +{"ts": "2025-12-27T03:52:47", "event": "train_log", "step": 1942, "epoch": 0.8194092827004219, "progress_pct": 13.66, "epoch_pct": 13.66, "eta": "61:31:37", "max_grad_norm": 0.8, "loss": 0.7977840900421143, "grad_norm": 1.250558853149414, "learning_rate": 9.907322070664542e-05} +{"ts": "2025-12-27T03:53:05", "event": "train_log", "step": 1944, "epoch": 0.8202531645569621, "progress_pct": 13.67, "epoch_pct": 13.67, "eta": "61:29:11", "max_grad_norm": 0.8, "loss": 0.7830103635787964, "grad_norm": 1.3892892599105835, "learning_rate": 9.906861277742297e-05} +{"ts": "2025-12-27T03:53:24", "event": "train_log", "step": 1946, "epoch": 0.8210970464135021, "progress_pct": 13.68, "epoch_pct": 13.68, "eta": "61:26:45", "max_grad_norm": 0.8, "loss": 0.8451479077339172, "grad_norm": 1.3152644634246826, "learning_rate": 9.906399352901393e-05} +{"ts": "2025-12-27T03:53:44", "event": "train_log", "step": 1948, "epoch": 0.8219409282700422, "progress_pct": 13.7, "epoch_pct": 13.7, "eta": "61:24:27", "max_grad_norm": 0.8, "loss": 0.7035528421401978, "grad_norm": 1.1102250814437866, "learning_rate": 9.905936296248388e-05} +{"ts": "2025-12-27T03:54:04", "event": "train_log", "step": 1950, "epoch": 0.8227848101265823, "progress_pct": 13.71, "epoch_pct": 13.71, "eta": "61:22:11", "max_grad_norm": 0.8, "loss": 0.764616847038269, "grad_norm": 1.0271214246749878, "learning_rate": 9.905472107890101e-05} +{"ts": "2025-12-27T03:54:23", "event": "train_log", "step": 1952, "epoch": 0.8236286919831224, "progress_pct": 13.73, "epoch_pct": 13.73, "eta": "61:19:49", "max_grad_norm": 0.8, "loss": 0.7699717283248901, "grad_norm": 1.1772255897521973, "learning_rate": 9.905006787933609e-05} +{"ts": "2025-12-27T03:54:41", "event": "train_log", "step": 1954, "epoch": 0.8244725738396624, "progress_pct": 13.74, "epoch_pct": 13.74, "eta": "61:17:22", "max_grad_norm": 0.8, "loss": 0.7755605578422546, "grad_norm": 1.2486404180526733, "learning_rate": 9.904540336486252e-05} +{"ts": "2025-12-27T03:55:02", "event": "train_log", "step": 1956, "epoch": 0.8253164556962025, "progress_pct": 13.76, "epoch_pct": 13.76, "eta": "61:15:09", "max_grad_norm": 0.8, "loss": 0.688934326171875, "grad_norm": 1.070148229598999, "learning_rate": 9.904072753655635e-05} +{"ts": "2025-12-27T03:55:21", "event": "train_log", "step": 1958, "epoch": 0.8261603375527427, "progress_pct": 13.77, "epoch_pct": 13.77, "eta": "61:12:51", "max_grad_norm": 0.8, "loss": 0.7447791695594788, "grad_norm": 1.118401288986206, "learning_rate": 9.903604039549617e-05} +{"ts": "2025-12-27T03:55:40", "event": "train_log", "step": 1960, "epoch": 0.8270042194092827, "progress_pct": 13.78, "epoch_pct": 13.78, "eta": "61:10:27", "max_grad_norm": 0.8, "loss": 0.7990683317184448, "grad_norm": 1.2209899425506592, "learning_rate": 9.903134194276323e-05} +{"ts": "2025-12-27T03:55:59", "event": "train_log", "step": 1962, "epoch": 0.8278481012658228, "progress_pct": 13.8, "epoch_pct": 13.8, "eta": "61:08:04", "max_grad_norm": 0.8, "loss": 0.7290873527526855, "grad_norm": 1.296093225479126, "learning_rate": 9.902663217944137e-05} +{"ts": "2025-12-27T03:56:18", "event": "train_log", "step": 1964, "epoch": 0.8286919831223629, "progress_pct": 13.81, "epoch_pct": 13.81, "eta": "61:05:42", "max_grad_norm": 0.8, "loss": 0.7971217036247253, "grad_norm": 1.2594937086105347, "learning_rate": 9.902191110661704e-05} +{"ts": "2025-12-27T03:56:37", "event": "train_log", "step": 1966, "epoch": 0.829535864978903, "progress_pct": 13.83, "epoch_pct": 13.83, "eta": "61:03:20", "max_grad_norm": 0.8, "loss": 0.6728768348693848, "grad_norm": 1.6016536951065063, "learning_rate": 9.90171787253793e-05} +{"ts": "2025-12-27T03:56:56", "event": "train_log", "step": 1968, "epoch": 0.830379746835443, "progress_pct": 13.84, "epoch_pct": 13.84, "eta": "61:01:00", "max_grad_norm": 0.8, "loss": 0.7684211730957031, "grad_norm": 3.3128950595855713, "learning_rate": 9.901243503681983e-05} +{"ts": "2025-12-27T03:57:15", "event": "train_log", "step": 1970, "epoch": 0.8312236286919831, "progress_pct": 13.85, "epoch_pct": 13.85, "eta": "60:58:42", "max_grad_norm": 0.8, "loss": 0.756637454032898, "grad_norm": 1.2970373630523682, "learning_rate": 9.90076800420329e-05} +{"ts": "2025-12-27T03:57:37", "event": "train_log", "step": 1972, "epoch": 0.8320675105485232, "progress_pct": 13.87, "epoch_pct": 13.87, "eta": "60:56:36", "max_grad_norm": 0.8, "loss": 0.6692084074020386, "grad_norm": 1.1388959884643555, "learning_rate": 9.900291374211538e-05} +{"ts": "2025-12-27T03:57:55", "event": "train_log", "step": 1974, "epoch": 0.8329113924050633, "progress_pct": 13.88, "epoch_pct": 13.88, "eta": "60:54:13", "max_grad_norm": 0.8, "loss": 0.7298309803009033, "grad_norm": 1.050641655921936, "learning_rate": 9.899813613816677e-05} +{"ts": "2025-12-27T03:58:14", "event": "train_log", "step": 1976, "epoch": 0.8337552742616033, "progress_pct": 13.9, "epoch_pct": 13.9, "eta": "60:51:54", "max_grad_norm": 0.8, "loss": 0.6886547803878784, "grad_norm": 1.2598577737808228, "learning_rate": 9.899334723128922e-05} +{"ts": "2025-12-27T03:58:34", "event": "train_log", "step": 1978, "epoch": 0.8345991561181435, "progress_pct": 13.91, "epoch_pct": 13.91, "eta": "60:49:37", "max_grad_norm": 0.8, "loss": 0.745341420173645, "grad_norm": 1.2800767421722412, "learning_rate": 9.898854702258735e-05} +{"ts": "2025-12-27T03:58:53", "event": "train_log", "step": 1980, "epoch": 0.8354430379746836, "progress_pct": 13.92, "epoch_pct": 13.92, "eta": "60:47:17", "max_grad_norm": 0.8, "loss": 0.7133575081825256, "grad_norm": 1.1923155784606934, "learning_rate": 9.898373551316856e-05} +{"ts": "2025-12-27T03:59:13", "event": "train_log", "step": 1982, "epoch": 0.8362869198312236, "progress_pct": 13.94, "epoch_pct": 13.94, "eta": "60:45:07", "max_grad_norm": 0.8, "loss": 0.8117790818214417, "grad_norm": 1.156121015548706, "learning_rate": 9.897891270414272e-05} +{"ts": "2025-12-27T03:59:34", "event": "train_log", "step": 1984, "epoch": 0.8371308016877637, "progress_pct": 13.95, "epoch_pct": 13.95, "eta": "60:43:01", "max_grad_norm": 0.8, "loss": 0.6094260215759277, "grad_norm": 1.0400618314743042, "learning_rate": 9.897407859662238e-05} +{"ts": "2025-12-27T03:59:55", "event": "train_log", "step": 1986, "epoch": 0.8379746835443038, "progress_pct": 13.97, "epoch_pct": 13.97, "eta": "60:40:54", "max_grad_norm": 0.8, "loss": 0.7680332064628601, "grad_norm": 1.451953411102295, "learning_rate": 9.896923319172268e-05} +{"ts": "2025-12-27T04:00:16", "event": "train_log", "step": 1988, "epoch": 0.8388185654008439, "progress_pct": 13.98, "epoch_pct": 13.98, "eta": "60:38:44", "max_grad_norm": 0.8, "loss": 0.6918784379959106, "grad_norm": 1.2560248374938965, "learning_rate": 9.896437649056134e-05} +{"ts": "2025-12-27T04:00:35", "event": "train_log", "step": 1990, "epoch": 0.8396624472573839, "progress_pct": 13.99, "epoch_pct": 13.99, "eta": "60:36:28", "max_grad_norm": 0.8, "loss": 0.7654696106910706, "grad_norm": 1.2744325399398804, "learning_rate": 9.895950849425874e-05} +{"ts": "2025-12-27T04:00:55", "event": "train_log", "step": 1992, "epoch": 0.8405063291139241, "progress_pct": 14.01, "epoch_pct": 14.01, "eta": "60:34:15", "max_grad_norm": 0.8, "loss": 0.7585932612419128, "grad_norm": 1.304439902305603, "learning_rate": 9.895462920393781e-05} +{"ts": "2025-12-27T04:01:14", "event": "train_log", "step": 1994, "epoch": 0.8413502109704641, "progress_pct": 14.02, "epoch_pct": 14.02, "eta": "60:31:58", "max_grad_norm": 0.8, "loss": 0.7474164962768555, "grad_norm": 1.578957200050354, "learning_rate": 9.89497386207241e-05} +{"ts": "2025-12-27T04:01:35", "event": "train_log", "step": 1996, "epoch": 0.8421940928270042, "progress_pct": 14.04, "epoch_pct": 14.04, "eta": "60:29:52", "max_grad_norm": 0.8, "loss": 0.663844883441925, "grad_norm": 1.0358996391296387, "learning_rate": 9.89448367457458e-05} +{"ts": "2025-12-27T04:01:53", "event": "train_log", "step": 1998, "epoch": 0.8430379746835444, "progress_pct": 14.05, "epoch_pct": 14.05, "eta": "60:27:32", "max_grad_norm": 0.8, "loss": 0.7578557729721069, "grad_norm": 1.2285103797912598, "learning_rate": 9.893992358013366e-05} +{"ts": "2025-12-27T04:02:13", "event": "train_log", "step": 2000, "epoch": 0.8438818565400844, "progress_pct": 14.06, "epoch_pct": 14.06, "eta": "60:25:21", "max_grad_norm": 0.8, "loss": 0.7795036435127258, "grad_norm": 1.2051875591278076, "learning_rate": 9.893499912502108e-05} +{"ts": "2025-12-27T04:16:30", "event": "train_log", "step": 2000, "epoch": 0.8438818565400844, "progress_pct": 14.06, "epoch_pct": 14.06, "eta": "61:52:32", "max_grad_norm": 0.8, "eval_loss": 0.7587011456489563, "eval_runtime": 856.2276, "eval_samples_per_second": 2.461, "eval_steps_per_second": 2.461} +{"ts": "2025-12-27T04:16:52", "event": "train_log", "step": 2002, "epoch": 0.8447257383966245, "progress_pct": 14.08, "epoch_pct": 14.08, "eta": "61:50:29", "max_grad_norm": 0.8, "loss": 0.731850802898407, "grad_norm": 1.145434021949768, "learning_rate": 9.893006338154401e-05} +{"ts": "2025-12-27T04:17:11", "event": "train_log", "step": 2004, "epoch": 0.8455696202531645, "progress_pct": 14.09, "epoch_pct": 14.09, "eta": "61:48:08", "max_grad_norm": 0.8, "loss": 0.6711665391921997, "grad_norm": 1.0618077516555786, "learning_rate": 9.892511635084101e-05} +{"ts": "2025-12-27T04:17:30", "event": "train_log", "step": 2006, "epoch": 0.8464135021097047, "progress_pct": 14.11, "epoch_pct": 14.11, "eta": "61:45:46", "max_grad_norm": 0.8, "loss": 0.6894803643226624, "grad_norm": 1.1657867431640625, "learning_rate": 9.892015803405331e-05} +{"ts": "2025-12-27T04:17:50", "event": "train_log", "step": 2008, "epoch": 0.8472573839662447, "progress_pct": 14.12, "epoch_pct": 14.12, "eta": "61:43:27", "max_grad_norm": 0.8, "loss": 0.628146231174469, "grad_norm": 1.080140233039856, "learning_rate": 9.891518843232467e-05} +{"ts": "2025-12-27T04:18:09", "event": "train_log", "step": 2010, "epoch": 0.8481012658227848, "progress_pct": 14.14, "epoch_pct": 14.14, "eta": "61:41:07", "max_grad_norm": 0.8, "loss": 0.740858793258667, "grad_norm": 1.0664509534835815, "learning_rate": 9.891020754680151e-05} +{"ts": "2025-12-27T04:18:28", "event": "train_log", "step": 2012, "epoch": 0.8489451476793249, "progress_pct": 14.15, "epoch_pct": 14.15, "eta": "61:38:43", "max_grad_norm": 0.8, "loss": 0.7763919234275818, "grad_norm": 1.5567615032196045, "learning_rate": 9.89052153786328e-05} +{"ts": "2025-12-27T04:18:47", "event": "train_log", "step": 2014, "epoch": 0.849789029535865, "progress_pct": 14.16, "epoch_pct": 14.16, "eta": "61:36:23", "max_grad_norm": 0.8, "loss": 0.8131396770477295, "grad_norm": 1.4347095489501953, "learning_rate": 9.890021192897016e-05} +{"ts": "2025-12-27T04:19:06", "event": "train_log", "step": 2016, "epoch": 0.850632911392405, "progress_pct": 14.18, "epoch_pct": 14.18, "eta": "61:34:04", "max_grad_norm": 0.8, "loss": 0.6829051375389099, "grad_norm": 1.1787892580032349, "learning_rate": 9.889519719896776e-05} +{"ts": "2025-12-27T04:19:25", "event": "train_log", "step": 2018, "epoch": 0.8514767932489451, "progress_pct": 14.19, "epoch_pct": 14.19, "eta": "61:31:40", "max_grad_norm": 0.8, "loss": 0.7664558291435242, "grad_norm": 1.239745855331421, "learning_rate": 9.889017118978241e-05} +{"ts": "2025-12-27T04:19:45", "event": "train_log", "step": 2020, "epoch": 0.8523206751054853, "progress_pct": 14.21, "epoch_pct": 14.21, "eta": "61:29:23", "max_grad_norm": 0.8, "loss": 0.7307376861572266, "grad_norm": 1.1224207878112793, "learning_rate": 9.888513390257352e-05} +{"ts": "2025-12-27T04:20:04", "event": "train_log", "step": 2022, "epoch": 0.8531645569620253, "progress_pct": 14.22, "epoch_pct": 14.22, "eta": "61:27:06", "max_grad_norm": 0.8, "loss": 0.6786578893661499, "grad_norm": 1.100536823272705, "learning_rate": 9.88800853385031e-05} +{"ts": "2025-12-27T04:20:24", "event": "train_log", "step": 2024, "epoch": 0.8540084388185654, "progress_pct": 14.23, "epoch_pct": 14.23, "eta": "61:24:49", "max_grad_norm": 0.8, "loss": 0.7971984148025513, "grad_norm": 1.25773024559021, "learning_rate": 9.887502549873576e-05} +{"ts": "2025-12-27T04:20:44", "event": "train_log", "step": 2026, "epoch": 0.8548523206751055, "progress_pct": 14.25, "epoch_pct": 14.25, "eta": "61:22:39", "max_grad_norm": 0.8, "loss": 0.6990941166877747, "grad_norm": 0.9980104565620422, "learning_rate": 9.886995438443868e-05} +{"ts": "2025-12-27T04:21:04", "event": "train_log", "step": 2028, "epoch": 0.8556962025316456, "progress_pct": 14.26, "epoch_pct": 14.26, "eta": "61:20:21", "max_grad_norm": 0.8, "loss": 0.763938307762146, "grad_norm": 1.0464621782302856, "learning_rate": 9.886487199678171e-05} +{"ts": "2025-12-27T04:21:22", "event": "train_log", "step": 2030, "epoch": 0.8565400843881856, "progress_pct": 14.28, "epoch_pct": 14.28, "eta": "61:17:58", "max_grad_norm": 0.8, "loss": 0.7165632247924805, "grad_norm": 1.2303017377853394, "learning_rate": 9.885977833693724e-05} +{"ts": "2025-12-27T04:21:40", "event": "train_log", "step": 2032, "epoch": 0.8573839662447258, "progress_pct": 14.29, "epoch_pct": 14.29, "eta": "61:15:32", "max_grad_norm": 0.8, "loss": 0.7586364150047302, "grad_norm": 1.2203325033187866, "learning_rate": 9.885467340608027e-05} +{"ts": "2025-12-27T04:22:00", "event": "train_log", "step": 2034, "epoch": 0.8582278481012658, "progress_pct": 14.3, "epoch_pct": 14.3, "eta": "61:13:21", "max_grad_norm": 0.8, "loss": 0.703253984451294, "grad_norm": 1.113882064819336, "learning_rate": 9.884955720538843e-05} +{"ts": "2025-12-27T04:22:20", "event": "train_log", "step": 2036, "epoch": 0.8590717299578059, "progress_pct": 14.32, "epoch_pct": 14.32, "eta": "61:11:08", "max_grad_norm": 0.8, "loss": 0.8530917763710022, "grad_norm": 1.1731632947921753, "learning_rate": 9.88444297360419e-05} +{"ts": "2025-12-27T04:22:39", "event": "train_log", "step": 2038, "epoch": 0.859915611814346, "progress_pct": 14.33, "epoch_pct": 14.33, "eta": "61:08:46", "max_grad_norm": 0.8, "loss": 0.8166638612747192, "grad_norm": 1.4592338800430298, "learning_rate": 9.883929099922349e-05} +{"ts": "2025-12-27T04:23:00", "event": "train_log", "step": 2040, "epoch": 0.8607594936708861, "progress_pct": 14.35, "epoch_pct": 14.35, "eta": "61:06:39", "max_grad_norm": 0.8, "loss": 0.6762415170669556, "grad_norm": 1.1279125213623047, "learning_rate": 9.883414099611864e-05} +{"ts": "2025-12-27T04:23:21", "event": "train_log", "step": 2042, "epoch": 0.8616033755274262, "progress_pct": 14.36, "epoch_pct": 14.36, "eta": "61:04:33", "max_grad_norm": 0.8, "loss": 0.6826539039611816, "grad_norm": 1.1587293148040771, "learning_rate": 9.882897972791534e-05} +{"ts": "2025-12-27T04:23:40", "event": "train_log", "step": 2044, "epoch": 0.8624472573839662, "progress_pct": 14.37, "epoch_pct": 14.37, "eta": "61:02:15", "max_grad_norm": 0.8, "loss": 0.7372410893440247, "grad_norm": 1.1909502744674683, "learning_rate": 9.88238071958042e-05} +{"ts": "2025-12-27T04:24:01", "event": "train_log", "step": 2046, "epoch": 0.8632911392405064, "progress_pct": 14.39, "epoch_pct": 14.39, "eta": "61:00:10", "max_grad_norm": 0.8, "loss": 0.699260950088501, "grad_norm": 1.0340155363082886, "learning_rate": 9.881862340097841e-05} +{"ts": "2025-12-27T04:24:21", "event": "train_log", "step": 2048, "epoch": 0.8641350210970464, "progress_pct": 14.4, "epoch_pct": 14.4, "eta": "60:57:58", "max_grad_norm": 0.8, "loss": 0.7689789533615112, "grad_norm": 1.1745870113372803, "learning_rate": 9.881342834463379e-05} +{"ts": "2025-12-27T04:24:41", "event": "train_log", "step": 2050, "epoch": 0.8649789029535865, "progress_pct": 14.42, "epoch_pct": 14.42, "eta": "60:55:50", "max_grad_norm": 0.8, "loss": 0.6877372860908508, "grad_norm": 1.0003606081008911, "learning_rate": 9.880822202796872e-05} +{"ts": "2025-12-27T04:25:02", "event": "train_log", "step": 2052, "epoch": 0.8658227848101265, "progress_pct": 14.43, "epoch_pct": 14.43, "eta": "60:53:43", "max_grad_norm": 0.8, "loss": 0.7632413506507874, "grad_norm": 1.2546781301498413, "learning_rate": 9.88030044521842e-05} +{"ts": "2025-12-27T04:25:22", "event": "train_log", "step": 2054, "epoch": 0.8666666666666667, "progress_pct": 14.44, "epoch_pct": 14.44, "eta": "60:51:33", "max_grad_norm": 0.8, "loss": 0.6776729822158813, "grad_norm": 1.1178704500198364, "learning_rate": 9.879777561848385e-05} +{"ts": "2025-12-27T04:25:41", "event": "train_log", "step": 2056, "epoch": 0.8675105485232067, "progress_pct": 14.46, "epoch_pct": 14.46, "eta": "60:49:15", "max_grad_norm": 0.8, "loss": 0.7592973709106445, "grad_norm": 1.523606777191162, "learning_rate": 9.879253552807384e-05} +{"ts": "2025-12-27T04:26:01", "event": "train_log", "step": 2058, "epoch": 0.8683544303797468, "progress_pct": 14.47, "epoch_pct": 14.47, "eta": "60:47:07", "max_grad_norm": 0.8, "loss": 0.8028839230537415, "grad_norm": 1.3490995168685913, "learning_rate": 9.878728418216296e-05} +{"ts": "2025-12-27T04:26:21", "event": "train_log", "step": 2060, "epoch": 0.869198312236287, "progress_pct": 14.49, "epoch_pct": 14.49, "eta": "60:44:55", "max_grad_norm": 0.8, "loss": 0.7499933838844299, "grad_norm": 1.1851624250411987, "learning_rate": 9.87820215819626e-05} +{"ts": "2025-12-27T04:26:40", "event": "train_log", "step": 2062, "epoch": 0.870042194092827, "progress_pct": 14.5, "epoch_pct": 14.5, "eta": "60:42:38", "max_grad_norm": 0.8, "loss": 0.7324717044830322, "grad_norm": 1.1877925395965576, "learning_rate": 9.877674772868672e-05} +{"ts": "2025-12-27T04:26:59", "event": "train_log", "step": 2064, "epoch": 0.8708860759493671, "progress_pct": 14.51, "epoch_pct": 14.51, "eta": "60:40:21", "max_grad_norm": 0.8, "loss": 0.7456585168838501, "grad_norm": 1.2982885837554932, "learning_rate": 9.877146262355194e-05} +{"ts": "2025-12-27T04:27:20", "event": "train_log", "step": 2066, "epoch": 0.8717299578059071, "progress_pct": 14.53, "epoch_pct": 14.53, "eta": "60:38:18", "max_grad_norm": 0.8, "loss": 0.7552799582481384, "grad_norm": 1.043912649154663, "learning_rate": 9.876616626777739e-05} +{"ts": "2025-12-27T04:27:40", "event": "train_log", "step": 2068, "epoch": 0.8725738396624473, "progress_pct": 14.54, "epoch_pct": 14.54, "eta": "60:36:09", "max_grad_norm": 0.8, "loss": 0.6964990496635437, "grad_norm": 1.172580599784851, "learning_rate": 9.876085866258487e-05} +{"ts": "2025-12-27T04:28:00", "event": "train_log", "step": 2070, "epoch": 0.8734177215189873, "progress_pct": 14.56, "epoch_pct": 14.56, "eta": "60:34:01", "max_grad_norm": 0.8, "loss": 0.7368612289428711, "grad_norm": 1.26815927028656, "learning_rate": 9.875553980919871e-05} +{"ts": "2025-12-27T04:28:21", "event": "train_log", "step": 2072, "epoch": 0.8742616033755274, "progress_pct": 14.57, "epoch_pct": 14.57, "eta": "60:31:58", "max_grad_norm": 0.8, "loss": 0.7400802969932556, "grad_norm": 1.1268136501312256, "learning_rate": 9.875020970884587e-05} +{"ts": "2025-12-27T04:28:43", "event": "train_log", "step": 2074, "epoch": 0.8751054852320675, "progress_pct": 14.59, "epoch_pct": 14.59, "eta": "60:30:00", "max_grad_norm": 0.8, "loss": 0.6931334137916565, "grad_norm": 1.0556721687316895, "learning_rate": 9.874486836275594e-05} +{"ts": "2025-12-27T04:29:03", "event": "train_log", "step": 2076, "epoch": 0.8759493670886076, "progress_pct": 14.6, "epoch_pct": 14.6, "eta": "60:27:53", "max_grad_norm": 0.8, "loss": 0.7124089002609253, "grad_norm": 1.1967823505401611, "learning_rate": 9.873951577216106e-05} +{"ts": "2025-12-27T04:29:23", "event": "train_log", "step": 2078, "epoch": 0.8767932489451477, "progress_pct": 14.61, "epoch_pct": 14.61, "eta": "60:25:42", "max_grad_norm": 0.8, "loss": 0.7462030053138733, "grad_norm": 1.1753164529800415, "learning_rate": 9.873415193829591e-05} +{"ts": "2025-12-27T04:29:42", "event": "train_log", "step": 2080, "epoch": 0.8776371308016878, "progress_pct": 14.63, "epoch_pct": 14.63, "eta": "60:23:29", "max_grad_norm": 0.8, "loss": 0.778078019618988, "grad_norm": 1.326923131942749, "learning_rate": 9.872877686239789e-05} +{"ts": "2025-12-27T04:30:03", "event": "train_log", "step": 2082, "epoch": 0.8784810126582279, "progress_pct": 14.64, "epoch_pct": 14.64, "eta": "60:21:24", "max_grad_norm": 0.8, "loss": 0.6592919826507568, "grad_norm": 1.1472662687301636, "learning_rate": 9.87233905457069e-05} +{"ts": "2025-12-27T04:30:23", "event": "train_log", "step": 2084, "epoch": 0.8793248945147679, "progress_pct": 14.66, "epoch_pct": 14.66, "eta": "60:19:20", "max_grad_norm": 0.8, "loss": 0.661717414855957, "grad_norm": 1.1162762641906738, "learning_rate": 9.871799298946544e-05} +{"ts": "2025-12-27T04:30:44", "event": "train_log", "step": 2086, "epoch": 0.880168776371308, "progress_pct": 14.67, "epoch_pct": 14.67, "eta": "60:17:16", "max_grad_norm": 0.8, "loss": 0.6203670501708984, "grad_norm": 1.1694408655166626, "learning_rate": 9.871258419491866e-05} +{"ts": "2025-12-27T04:31:04", "event": "train_log", "step": 2088, "epoch": 0.8810126582278481, "progress_pct": 14.68, "epoch_pct": 14.68, "eta": "60:15:06", "max_grad_norm": 0.8, "loss": 0.758888304233551, "grad_norm": 1.229691505432129, "learning_rate": 9.870716416331425e-05} +{"ts": "2025-12-27T04:31:21", "event": "train_log", "step": 2090, "epoch": 0.8818565400843882, "progress_pct": 14.7, "epoch_pct": 14.7, "eta": "60:12:47", "max_grad_norm": 0.8, "loss": 0.760649561882019, "grad_norm": 1.540377140045166, "learning_rate": 9.870173289590251e-05} +{"ts": "2025-12-27T04:31:43", "event": "train_log", "step": 2092, "epoch": 0.8827004219409282, "progress_pct": 14.71, "epoch_pct": 14.71, "eta": "60:10:51", "max_grad_norm": 0.8, "loss": 0.6981227397918701, "grad_norm": 1.173628568649292, "learning_rate": 9.869629039393632e-05} +{"ts": "2025-12-27T04:32:04", "event": "train_log", "step": 2094, "epoch": 0.8835443037974684, "progress_pct": 14.73, "epoch_pct": 14.73, "eta": "60:08:46", "max_grad_norm": 0.8, "loss": 0.7808336615562439, "grad_norm": 1.1404013633728027, "learning_rate": 9.869083665867116e-05} +{"ts": "2025-12-27T04:32:23", "event": "train_log", "step": 2096, "epoch": 0.8843881856540085, "progress_pct": 14.74, "epoch_pct": 14.74, "eta": "60:06:35", "max_grad_norm": 0.8, "loss": 0.7540555596351624, "grad_norm": 1.1038721799850464, "learning_rate": 9.868537169136511e-05} +{"ts": "2025-12-27T04:32:44", "event": "train_log", "step": 2098, "epoch": 0.8852320675105485, "progress_pct": 14.75, "epoch_pct": 14.75, "eta": "60:04:37", "max_grad_norm": 0.8, "loss": 0.6650454998016357, "grad_norm": 1.1510080099105835, "learning_rate": 9.867989549327885e-05} +{"ts": "2025-12-27T04:33:06", "event": "train_log", "step": 2100, "epoch": 0.8860759493670886, "progress_pct": 14.77, "epoch_pct": 14.77, "eta": "60:02:39", "max_grad_norm": 0.8, "loss": 0.673769474029541, "grad_norm": 1.166912317276001, "learning_rate": 9.867440806567561e-05} +{"ts": "2025-12-27T04:47:14", "event": "train_log", "step": 2100, "epoch": 0.8860759493670886, "progress_pct": 14.77, "epoch_pct": 14.77, "eta": "61:24:12", "max_grad_norm": 0.8, "eval_loss": 0.7559094429016113, "eval_runtime": 847.8311, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485} +{"ts": "2025-12-27T04:47:35", "event": "train_log", "step": 2102, "epoch": 0.8869198312236287, "progress_pct": 14.78, "epoch_pct": 14.78, "eta": "61:22:07", "max_grad_norm": 0.8, "loss": 0.8314241766929626, "grad_norm": 1.227583885192871, "learning_rate": 9.866890940982121e-05} +{"ts": "2025-12-27T04:47:56", "event": "train_log", "step": 2104, "epoch": 0.8877637130801688, "progress_pct": 14.8, "epoch_pct": 14.8, "eta": "61:20:01", "max_grad_norm": 0.8, "loss": 0.6770843863487244, "grad_norm": 1.1813976764678955, "learning_rate": 9.866339952698413e-05} +{"ts": "2025-12-27T04:48:16", "event": "train_log", "step": 2106, "epoch": 0.8886075949367088, "progress_pct": 14.81, "epoch_pct": 14.81, "eta": "61:17:54", "max_grad_norm": 0.8, "loss": 0.7142292857170105, "grad_norm": 1.2471063137054443, "learning_rate": 9.865787841843539e-05} +{"ts": "2025-12-27T04:48:35", "event": "train_log", "step": 2108, "epoch": 0.889451476793249, "progress_pct": 14.82, "epoch_pct": 14.82, "eta": "61:15:34", "max_grad_norm": 0.8, "loss": 0.6981731653213501, "grad_norm": 1.1602860689163208, "learning_rate": 9.865234608544858e-05} +{"ts": "2025-12-27T04:48:55", "event": "train_log", "step": 2110, "epoch": 0.890295358649789, "progress_pct": 14.84, "epoch_pct": 14.84, "eta": "61:13:26", "max_grad_norm": 0.8, "loss": 0.7019379138946533, "grad_norm": 1.145677089691162, "learning_rate": 9.864680252929992e-05} +{"ts": "2025-12-27T04:49:14", "event": "train_log", "step": 2112, "epoch": 0.8911392405063291, "progress_pct": 14.85, "epoch_pct": 14.85, "eta": "61:11:10", "max_grad_norm": 0.8, "loss": 0.7690986394882202, "grad_norm": 1.2222462892532349, "learning_rate": 9.86412477512682e-05} +{"ts": "2025-12-27T04:49:34", "event": "train_log", "step": 2114, "epoch": 0.8919831223628693, "progress_pct": 14.87, "epoch_pct": 14.87, "eta": "61:08:57", "max_grad_norm": 0.8, "loss": 0.7241792678833008, "grad_norm": 1.1288166046142578, "learning_rate": 9.863568175263478e-05} +{"ts": "2025-12-27T04:49:53", "event": "train_log", "step": 2116, "epoch": 0.8928270042194093, "progress_pct": 14.88, "epoch_pct": 14.88, "eta": "61:06:45", "max_grad_norm": 0.8, "loss": 0.7392162084579468, "grad_norm": 1.1773978471755981, "learning_rate": 9.863010453468364e-05} +{"ts": "2025-12-27T04:50:14", "event": "train_log", "step": 2118, "epoch": 0.8936708860759494, "progress_pct": 14.89, "epoch_pct": 14.89, "eta": "61:04:38", "max_grad_norm": 0.8, "loss": 0.7603078484535217, "grad_norm": 1.102638840675354, "learning_rate": 9.862451609870136e-05} +{"ts": "2025-12-27T04:50:33", "event": "train_log", "step": 2120, "epoch": 0.8945147679324894, "progress_pct": 14.91, "epoch_pct": 14.91, "eta": "61:02:23", "max_grad_norm": 0.8, "loss": 0.6804911494255066, "grad_norm": 1.1325360536575317, "learning_rate": 9.861891644597707e-05} +{"ts": "2025-12-27T04:50:54", "event": "train_log", "step": 2122, "epoch": 0.8953586497890296, "progress_pct": 14.92, "epoch_pct": 14.92, "eta": "61:00:17", "max_grad_norm": 0.8, "loss": 0.787288248538971, "grad_norm": 1.1381969451904297, "learning_rate": 9.86133055778025e-05} +{"ts": "2025-12-27T04:51:15", "event": "train_log", "step": 2124, "epoch": 0.8962025316455696, "progress_pct": 14.94, "epoch_pct": 14.94, "eta": "60:58:15", "max_grad_norm": 0.8, "loss": 0.7282505035400391, "grad_norm": 1.2454546689987183, "learning_rate": 9.860768349547196e-05} +{"ts": "2025-12-27T04:51:34", "event": "train_log", "step": 2126, "epoch": 0.8970464135021097, "progress_pct": 14.95, "epoch_pct": 14.95, "eta": "60:56:03", "max_grad_norm": 0.8, "loss": 0.7554803490638733, "grad_norm": 1.2568305730819702, "learning_rate": 9.860205020028237e-05} +{"ts": "2025-12-27T04:51:55", "event": "train_log", "step": 2128, "epoch": 0.8978902953586498, "progress_pct": 14.96, "epoch_pct": 14.96, "eta": "60:53:56", "max_grad_norm": 0.8, "loss": 0.7126525044441223, "grad_norm": 1.1523523330688477, "learning_rate": 9.859640569353321e-05} +{"ts": "2025-12-27T04:52:15", "event": "train_log", "step": 2130, "epoch": 0.8987341772151899, "progress_pct": 14.98, "epoch_pct": 14.98, "eta": "60:51:47", "max_grad_norm": 0.8, "loss": 0.7300811409950256, "grad_norm": 1.314878225326538, "learning_rate": 9.859074997652658e-05} +{"ts": "2025-12-27T04:52:36", "event": "train_log", "step": 2132, "epoch": 0.8995780590717299, "progress_pct": 14.99, "epoch_pct": 14.99, "eta": "60:49:46", "max_grad_norm": 0.8, "loss": 0.7217329144477844, "grad_norm": 1.1272218227386475, "learning_rate": 9.858508305056713e-05} +{"ts": "2025-12-27T04:52:56", "event": "train_log", "step": 2134, "epoch": 0.90042194092827, "progress_pct": 15.01, "epoch_pct": 15.01, "eta": "60:47:39", "max_grad_norm": 0.8, "loss": 0.714308500289917, "grad_norm": 1.10934317111969, "learning_rate": 9.857940491696211e-05} +{"ts": "2025-12-27T04:53:15", "event": "train_log", "step": 2136, "epoch": 0.9012658227848102, "progress_pct": 15.02, "epoch_pct": 15.02, "eta": "60:45:26", "max_grad_norm": 0.8, "loss": 0.6613366007804871, "grad_norm": 1.1991039514541626, "learning_rate": 9.857371557702136e-05} +{"ts": "2025-12-27T04:53:32", "event": "train_log", "step": 2138, "epoch": 0.9021097046413502, "progress_pct": 15.04, "epoch_pct": 15.04, "eta": "60:43:01", "max_grad_norm": 0.8, "loss": 0.6972863078117371, "grad_norm": 1.3176918029785156, "learning_rate": 9.85680150320573e-05} +{"ts": "2025-12-27T04:53:52", "event": "train_log", "step": 2140, "epoch": 0.9029535864978903, "progress_pct": 15.05, "epoch_pct": 15.05, "eta": "60:40:51", "max_grad_norm": 0.8, "loss": 0.7299100160598755, "grad_norm": 1.1966592073440552, "learning_rate": 9.856230328338496e-05} +{"ts": "2025-12-27T04:54:11", "event": "train_log", "step": 2142, "epoch": 0.9037974683544304, "progress_pct": 15.06, "epoch_pct": 15.06, "eta": "60:38:39", "max_grad_norm": 0.8, "loss": 0.7145020961761475, "grad_norm": 1.2889270782470703, "learning_rate": 9.85565803323219e-05} +{"ts": "2025-12-27T04:54:32", "event": "train_log", "step": 2144, "epoch": 0.9046413502109705, "progress_pct": 15.08, "epoch_pct": 15.08, "eta": "60:36:39", "max_grad_norm": 0.8, "loss": 0.6717942953109741, "grad_norm": 1.2112789154052734, "learning_rate": 9.855084618018828e-05} +{"ts": "2025-12-27T04:54:51", "event": "train_log", "step": 2146, "epoch": 0.9054852320675105, "progress_pct": 15.09, "epoch_pct": 15.09, "eta": "60:34:24", "max_grad_norm": 0.8, "loss": 0.7460196018218994, "grad_norm": 1.2550239562988281, "learning_rate": 9.85451008283069e-05} +{"ts": "2025-12-27T04:55:12", "event": "train_log", "step": 2148, "epoch": 0.9063291139240506, "progress_pct": 15.11, "epoch_pct": 15.11, "eta": "60:32:24", "max_grad_norm": 0.8, "loss": 0.8300626873970032, "grad_norm": 1.2926387786865234, "learning_rate": 9.853934427800309e-05} +{"ts": "2025-12-27T04:55:32", "event": "train_log", "step": 2150, "epoch": 0.9071729957805907, "progress_pct": 15.12, "epoch_pct": 15.12, "eta": "60:30:19", "max_grad_norm": 0.8, "loss": 0.715215802192688, "grad_norm": 1.0690672397613525, "learning_rate": 9.853357653060478e-05} +{"ts": "2025-12-27T04:55:52", "event": "train_log", "step": 2152, "epoch": 0.9080168776371308, "progress_pct": 15.13, "epoch_pct": 15.13, "eta": "60:28:12", "max_grad_norm": 0.8, "loss": 0.7021427154541016, "grad_norm": 1.1021424531936646, "learning_rate": 9.852779758744245e-05} +{"ts": "2025-12-27T04:56:12", "event": "train_log", "step": 2154, "epoch": 0.9088607594936708, "progress_pct": 15.15, "epoch_pct": 15.15, "eta": "60:26:04", "max_grad_norm": 0.8, "loss": 0.7576406598091125, "grad_norm": 1.0713517665863037, "learning_rate": 9.852200744984921e-05} +{"ts": "2025-12-27T04:56:29", "event": "train_log", "step": 2156, "epoch": 0.909704641350211, "progress_pct": 15.16, "epoch_pct": 15.16, "eta": "60:23:43", "max_grad_norm": 0.8, "loss": 0.7008846998214722, "grad_norm": 1.277526617050171, "learning_rate": 9.851620611916075e-05} +{"ts": "2025-12-27T04:56:48", "event": "train_log", "step": 2158, "epoch": 0.9105485232067511, "progress_pct": 15.18, "epoch_pct": 15.18, "eta": "60:21:33", "max_grad_norm": 0.8, "loss": 0.7536613345146179, "grad_norm": 1.2434618473052979, "learning_rate": 9.85103935967153e-05} +{"ts": "2025-12-27T04:57:07", "event": "train_log", "step": 2160, "epoch": 0.9113924050632911, "progress_pct": 15.19, "epoch_pct": 15.19, "eta": "60:19:22", "max_grad_norm": 0.8, "loss": 0.7435567378997803, "grad_norm": 1.1654841899871826, "learning_rate": 9.850456988385371e-05} +{"ts": "2025-12-27T04:57:28", "event": "train_log", "step": 2162, "epoch": 0.9122362869198313, "progress_pct": 15.2, "epoch_pct": 15.2, "eta": "60:17:21", "max_grad_norm": 0.8, "loss": 0.7725666165351868, "grad_norm": 1.0718246698379517, "learning_rate": 9.849873498191939e-05} +{"ts": "2025-12-27T04:57:46", "event": "train_log", "step": 2164, "epoch": 0.9130801687763713, "progress_pct": 15.22, "epoch_pct": 15.22, "eta": "60:15:07", "max_grad_norm": 0.8, "loss": 0.7833593487739563, "grad_norm": 1.3425630331039429, "learning_rate": 9.849288889225835e-05} +{"ts": "2025-12-27T04:58:06", "event": "train_log", "step": 2166, "epoch": 0.9139240506329114, "progress_pct": 15.23, "epoch_pct": 15.23, "eta": "60:12:58", "max_grad_norm": 0.8, "loss": 0.7290158867835999, "grad_norm": 1.1989985704421997, "learning_rate": 9.848703161621917e-05} +{"ts": "2025-12-27T04:58:27", "event": "train_log", "step": 2168, "epoch": 0.9147679324894514, "progress_pct": 15.25, "epoch_pct": 15.25, "eta": "60:11:03", "max_grad_norm": 0.8, "loss": 0.6787996888160706, "grad_norm": 1.0549380779266357, "learning_rate": 9.8481163155153e-05} +{"ts": "2025-12-27T04:58:46", "event": "train_log", "step": 2170, "epoch": 0.9156118143459916, "progress_pct": 15.26, "epoch_pct": 15.26, "eta": "60:08:48", "max_grad_norm": 0.8, "loss": 0.7645748853683472, "grad_norm": 1.0757017135620117, "learning_rate": 9.847528351041359e-05} +{"ts": "2025-12-27T04:59:06", "event": "train_log", "step": 2172, "epoch": 0.9164556962025316, "progress_pct": 15.27, "epoch_pct": 15.27, "eta": "60:06:49", "max_grad_norm": 0.8, "loss": 0.6640698313713074, "grad_norm": 1.0636975765228271, "learning_rate": 9.846939268335726e-05} +{"ts": "2025-12-27T04:59:25", "event": "train_log", "step": 2174, "epoch": 0.9172995780590717, "progress_pct": 15.29, "epoch_pct": 15.29, "eta": "60:04:39", "max_grad_norm": 0.8, "loss": 0.7216284275054932, "grad_norm": 1.2038439512252808, "learning_rate": 9.846349067534291e-05} +{"ts": "2025-12-27T04:59:45", "event": "train_log", "step": 2176, "epoch": 0.9181434599156119, "progress_pct": 15.3, "epoch_pct": 15.3, "eta": "60:02:34", "max_grad_norm": 0.8, "loss": 0.7244991660118103, "grad_norm": 1.17854642868042, "learning_rate": 9.845757748773203e-05} +{"ts": "2025-12-27T05:00:05", "event": "train_log", "step": 2178, "epoch": 0.9189873417721519, "progress_pct": 15.32, "epoch_pct": 15.32, "eta": "60:00:29", "max_grad_norm": 0.8, "loss": 0.6043152809143066, "grad_norm": 1.0391159057617188, "learning_rate": 9.845165312188864e-05} +{"ts": "2025-12-27T05:00:24", "event": "train_log", "step": 2180, "epoch": 0.919831223628692, "progress_pct": 15.33, "epoch_pct": 15.33, "eta": "59:58:19", "max_grad_norm": 0.8, "loss": 0.7791659832000732, "grad_norm": 1.2382071018218994, "learning_rate": 9.844571757917944e-05} +{"ts": "2025-12-27T05:00:44", "event": "train_log", "step": 2182, "epoch": 0.920675105485232, "progress_pct": 15.34, "epoch_pct": 15.34, "eta": "59:56:14", "max_grad_norm": 0.8, "loss": 0.7190433144569397, "grad_norm": 1.0855708122253418, "learning_rate": 9.84397708609736e-05} +{"ts": "2025-12-27T05:01:04", "event": "train_log", "step": 2184, "epoch": 0.9215189873417722, "progress_pct": 15.36, "epoch_pct": 15.36, "eta": "59:54:11", "max_grad_norm": 0.8, "loss": 0.6648658514022827, "grad_norm": 1.103308916091919, "learning_rate": 9.843381296864291e-05} +{"ts": "2025-12-27T05:01:23", "event": "train_log", "step": 2186, "epoch": 0.9223628691983122, "progress_pct": 15.37, "epoch_pct": 15.37, "eta": "59:52:07", "max_grad_norm": 0.8, "loss": 0.6891760230064392, "grad_norm": 1.073517918586731, "learning_rate": 9.842784390356178e-05} +{"ts": "2025-12-27T05:01:43", "event": "train_log", "step": 2188, "epoch": 0.9232067510548523, "progress_pct": 15.39, "epoch_pct": 15.39, "eta": "59:50:00", "max_grad_norm": 0.8, "loss": 0.6880859136581421, "grad_norm": 1.0806199312210083, "learning_rate": 9.842186366710712e-05} +{"ts": "2025-12-27T05:02:03", "event": "train_log", "step": 2190, "epoch": 0.9240506329113924, "progress_pct": 15.4, "epoch_pct": 15.4, "eta": "59:47:57", "max_grad_norm": 0.8, "loss": 0.6238307952880859, "grad_norm": 1.0631483793258667, "learning_rate": 9.841587226065848e-05} +{"ts": "2025-12-27T05:02:22", "event": "train_log", "step": 2192, "epoch": 0.9248945147679325, "progress_pct": 15.41, "epoch_pct": 15.41, "eta": "59:45:49", "max_grad_norm": 0.8, "loss": 0.6905744075775146, "grad_norm": 1.2630863189697266, "learning_rate": 9.840986968559795e-05} +{"ts": "2025-12-27T05:02:41", "event": "train_log", "step": 2194, "epoch": 0.9257383966244725, "progress_pct": 15.43, "epoch_pct": 15.43, "eta": "59:43:42", "max_grad_norm": 0.8, "loss": 0.7531564235687256, "grad_norm": 1.1307560205459595, "learning_rate": 9.840385594331022e-05} +{"ts": "2025-12-27T05:03:01", "event": "train_log", "step": 2196, "epoch": 0.9265822784810127, "progress_pct": 15.44, "epoch_pct": 15.44, "eta": "59:41:43", "max_grad_norm": 0.8, "loss": 0.6750671863555908, "grad_norm": 1.0294862985610962, "learning_rate": 9.839783103518254e-05} +{"ts": "2025-12-27T05:03:20", "event": "train_log", "step": 2198, "epoch": 0.9274261603375528, "progress_pct": 15.46, "epoch_pct": 15.46, "eta": "59:39:35", "max_grad_norm": 0.8, "loss": 0.7200804352760315, "grad_norm": 1.2446976900100708, "learning_rate": 9.839179496260472e-05} +{"ts": "2025-12-27T05:03:39", "event": "train_log", "step": 2200, "epoch": 0.9282700421940928, "progress_pct": 15.47, "epoch_pct": 15.47, "eta": "59:37:27", "max_grad_norm": 0.8, "loss": 0.7002623677253723, "grad_norm": 1.2673420906066895, "learning_rate": 9.83857477269692e-05} +{"ts": "2025-12-27T05:17:56", "event": "train_log", "step": 2200, "epoch": 0.9282700421940928, "progress_pct": 15.47, "epoch_pct": 15.47, "eta": "60:55:28", "max_grad_norm": 0.8, "eval_loss": 0.7497645616531372, "eval_runtime": 856.8766, "eval_samples_per_second": 2.459, "eval_steps_per_second": 2.459} +{"ts": "2025-12-27T05:18:15", "event": "train_log", "step": 2202, "epoch": 0.9291139240506329, "progress_pct": 15.49, "epoch_pct": 15.49, "eta": "60:53:18", "max_grad_norm": 0.8, "loss": 0.7718265056610107, "grad_norm": 1.5114624500274658, "learning_rate": 9.837968932967094e-05} +{"ts": "2025-12-27T05:18:35", "event": "train_log", "step": 2204, "epoch": 0.929957805907173, "progress_pct": 15.5, "epoch_pct": 15.5, "eta": "60:51:10", "max_grad_norm": 0.8, "loss": 0.7204271554946899, "grad_norm": 1.2059369087219238, "learning_rate": 9.837361977210751e-05} +{"ts": "2025-12-27T05:18:56", "event": "train_log", "step": 2206, "epoch": 0.9308016877637131, "progress_pct": 15.51, "epoch_pct": 15.51, "eta": "60:49:10", "max_grad_norm": 0.8, "loss": 0.7371073961257935, "grad_norm": 1.2077301740646362, "learning_rate": 9.836753905567902e-05} +{"ts": "2025-12-27T05:19:17", "event": "train_log", "step": 2208, "epoch": 0.9316455696202531, "progress_pct": 15.53, "epoch_pct": 15.53, "eta": "60:47:12", "max_grad_norm": 0.8, "loss": 0.6601167321205139, "grad_norm": 1.120097279548645, "learning_rate": 9.836144718178818e-05} +{"ts": "2025-12-27T05:19:37", "event": "train_log", "step": 2210, "epoch": 0.9324894514767933, "progress_pct": 15.54, "epoch_pct": 15.54, "eta": "60:45:06", "max_grad_norm": 0.8, "loss": 0.6897423267364502, "grad_norm": 1.1755714416503906, "learning_rate": 9.835534415184029e-05} +{"ts": "2025-12-27T05:19:57", "event": "train_log", "step": 2212, "epoch": 0.9333333333333333, "progress_pct": 15.56, "epoch_pct": 15.56, "eta": "60:42:57", "max_grad_norm": 0.8, "loss": 0.758438229560852, "grad_norm": 1.3587000370025635, "learning_rate": 9.834922996724317e-05} +{"ts": "2025-12-27T05:20:18", "event": "train_log", "step": 2214, "epoch": 0.9341772151898734, "progress_pct": 15.57, "epoch_pct": 15.57, "eta": "60:40:56", "max_grad_norm": 0.8, "loss": 0.7489214539527893, "grad_norm": 1.1898177862167358, "learning_rate": 9.834310462940727e-05} +{"ts": "2025-12-27T05:20:39", "event": "train_log", "step": 2216, "epoch": 0.9350210970464135, "progress_pct": 15.58, "epoch_pct": 15.58, "eta": "60:38:58", "max_grad_norm": 0.8, "loss": 0.6844488382339478, "grad_norm": 1.0814623832702637, "learning_rate": 9.833696813974558e-05} +{"ts": "2025-12-27T05:21:00", "event": "train_log", "step": 2218, "epoch": 0.9358649789029536, "progress_pct": 15.6, "epoch_pct": 15.6, "eta": "60:37:00", "max_grad_norm": 0.8, "loss": 0.6617586016654968, "grad_norm": 1.1060179471969604, "learning_rate": 9.833082049967366e-05} +{"ts": "2025-12-27T05:21:20", "event": "train_log", "step": 2220, "epoch": 0.9367088607594937, "progress_pct": 15.61, "epoch_pct": 15.61, "eta": "60:34:56", "max_grad_norm": 0.8, "loss": 0.7383584976196289, "grad_norm": 1.1780575513839722, "learning_rate": 9.832466171060968e-05} +{"ts": "2025-12-27T05:21:41", "event": "train_log", "step": 2222, "epoch": 0.9375527426160337, "progress_pct": 15.63, "epoch_pct": 15.63, "eta": "60:32:57", "max_grad_norm": 0.8, "loss": 0.7764308452606201, "grad_norm": 1.3734618425369263, "learning_rate": 9.831849177397432e-05} +{"ts": "2025-12-27T05:22:01", "event": "train_log", "step": 2224, "epoch": 0.9383966244725739, "progress_pct": 15.64, "epoch_pct": 15.64, "eta": "60:30:52", "max_grad_norm": 0.8, "loss": 0.6834397912025452, "grad_norm": 1.1367733478546143, "learning_rate": 9.831231069119089e-05} +{"ts": "2025-12-27T05:22:22", "event": "train_log", "step": 2226, "epoch": 0.9392405063291139, "progress_pct": 15.65, "epoch_pct": 15.65, "eta": "60:28:51", "max_grad_norm": 0.8, "loss": 0.7054480910301208, "grad_norm": 1.1695492267608643, "learning_rate": 9.830611846368524e-05} +{"ts": "2025-12-27T05:22:44", "event": "train_log", "step": 2228, "epoch": 0.940084388185654, "progress_pct": 15.67, "epoch_pct": 15.67, "eta": "60:26:59", "max_grad_norm": 0.8, "loss": 0.694448709487915, "grad_norm": 1.0345736742019653, "learning_rate": 9.829991509288579e-05} +{"ts": "2025-12-27T05:23:03", "event": "train_log", "step": 2230, "epoch": 0.9409282700421941, "progress_pct": 15.68, "epoch_pct": 15.68, "eta": "60:24:49", "max_grad_norm": 0.8, "loss": 0.6839741468429565, "grad_norm": 1.298105239868164, "learning_rate": 9.829370058022356e-05} +{"ts": "2025-12-27T05:23:21", "event": "train_log", "step": 2232, "epoch": 0.9417721518987342, "progress_pct": 15.7, "epoch_pct": 15.7, "eta": "60:22:37", "max_grad_norm": 0.8, "loss": 0.7886884212493896, "grad_norm": 1.2905502319335938, "learning_rate": 9.828747492713209e-05} +{"ts": "2025-12-27T05:23:41", "event": "train_log", "step": 2234, "epoch": 0.9426160337552743, "progress_pct": 15.71, "epoch_pct": 15.71, "eta": "60:20:29", "max_grad_norm": 0.8, "loss": 0.7206413149833679, "grad_norm": 1.12301504611969, "learning_rate": 9.828123813504753e-05} +{"ts": "2025-12-27T05:24:00", "event": "train_log", "step": 2236, "epoch": 0.9434599156118143, "progress_pct": 15.72, "epoch_pct": 15.72, "eta": "60:18:23", "max_grad_norm": 0.8, "loss": 0.7700693607330322, "grad_norm": 1.2644896507263184, "learning_rate": 9.82749902054086e-05} +{"ts": "2025-12-27T05:24:19", "event": "train_log", "step": 2238, "epoch": 0.9443037974683545, "progress_pct": 15.74, "epoch_pct": 15.74, "eta": "60:16:13", "max_grad_norm": 0.8, "loss": 0.7199711203575134, "grad_norm": 1.1626365184783936, "learning_rate": 9.826873113965655e-05} +{"ts": "2025-12-27T05:24:37", "event": "train_log", "step": 2240, "epoch": 0.9451476793248945, "progress_pct": 15.75, "epoch_pct": 15.75, "eta": "60:14:02", "max_grad_norm": 0.8, "loss": 0.7183539271354675, "grad_norm": 1.0728627443313599, "learning_rate": 9.826246093923528e-05} +{"ts": "2025-12-27T05:24:56", "event": "train_log", "step": 2242, "epoch": 0.9459915611814346, "progress_pct": 15.77, "epoch_pct": 15.77, "eta": "60:11:51", "max_grad_norm": 0.8, "loss": 0.7417964935302734, "grad_norm": 1.1444766521453857, "learning_rate": 9.825617960559114e-05} +{"ts": "2025-12-27T05:25:14", "event": "train_log", "step": 2244, "epoch": 0.9468354430379747, "progress_pct": 15.78, "epoch_pct": 15.78, "eta": "60:09:40", "max_grad_norm": 0.8, "loss": 0.7949740290641785, "grad_norm": 1.4059823751449585, "learning_rate": 9.824988714017316e-05} +{"ts": "2025-12-27T05:25:34", "event": "train_log", "step": 2246, "epoch": 0.9476793248945148, "progress_pct": 15.79, "epoch_pct": 15.79, "eta": "60:07:36", "max_grad_norm": 0.8, "loss": 0.6433083415031433, "grad_norm": 1.1349766254425049, "learning_rate": 9.824358354443286e-05} +{"ts": "2025-12-27T05:25:53", "event": "train_log", "step": 2248, "epoch": 0.9485232067510548, "progress_pct": 15.81, "epoch_pct": 15.81, "eta": "60:05:31", "max_grad_norm": 0.8, "loss": 0.6519861817359924, "grad_norm": 1.0879144668579102, "learning_rate": 9.823726881982438e-05} +{"ts": "2025-12-27T05:26:12", "event": "train_log", "step": 2250, "epoch": 0.9493670886075949, "progress_pct": 15.82, "epoch_pct": 15.82, "eta": "60:03:24", "max_grad_norm": 0.8, "loss": 0.7280195355415344, "grad_norm": 1.2289162874221802, "learning_rate": 9.82309429678044e-05} +{"ts": "2025-12-27T05:26:32", "event": "train_log", "step": 2252, "epoch": 0.950210970464135, "progress_pct": 15.84, "epoch_pct": 15.84, "eta": "60:01:22", "max_grad_norm": 0.8, "loss": 0.7524687647819519, "grad_norm": 1.1755765676498413, "learning_rate": 9.822460598983217e-05} +{"ts": "2025-12-27T05:26:52", "event": "train_log", "step": 2254, "epoch": 0.9510548523206751, "progress_pct": 15.85, "epoch_pct": 15.85, "eta": "59:59:16", "max_grad_norm": 0.8, "loss": 0.7543174624443054, "grad_norm": 1.179807186126709, "learning_rate": 9.821825788736949e-05} +{"ts": "2025-12-27T05:27:11", "event": "train_log", "step": 2256, "epoch": 0.9518987341772152, "progress_pct": 15.86, "epoch_pct": 15.86, "eta": "59:57:11", "max_grad_norm": 0.8, "loss": 0.716377854347229, "grad_norm": 1.1234289407730103, "learning_rate": 9.821189866188079e-05} +{"ts": "2025-12-27T05:27:31", "event": "train_log", "step": 2258, "epoch": 0.9527426160337553, "progress_pct": 15.88, "epoch_pct": 15.88, "eta": "59:55:08", "max_grad_norm": 0.8, "loss": 0.6403332948684692, "grad_norm": 1.0324063301086426, "learning_rate": 9.820552831483297e-05} +{"ts": "2025-12-27T05:27:50", "event": "train_log", "step": 2260, "epoch": 0.9535864978902954, "progress_pct": 15.89, "epoch_pct": 15.89, "eta": "59:53:03", "max_grad_norm": 0.8, "loss": 0.7406947612762451, "grad_norm": 1.1459579467773438, "learning_rate": 9.819914684769558e-05} +{"ts": "2025-12-27T05:28:09", "event": "train_log", "step": 2262, "epoch": 0.9544303797468354, "progress_pct": 15.91, "epoch_pct": 15.91, "eta": "59:50:57", "max_grad_norm": 0.8, "loss": 0.749687671661377, "grad_norm": 1.2886124849319458, "learning_rate": 9.819275426194072e-05} +{"ts": "2025-12-27T05:28:27", "event": "train_log", "step": 2264, "epoch": 0.9552742616033755, "progress_pct": 15.92, "epoch_pct": 15.92, "eta": "59:48:46", "max_grad_norm": 0.8, "loss": 0.778410017490387, "grad_norm": 1.3349844217300415, "learning_rate": 9.818635055904299e-05} +{"ts": "2025-12-27T05:28:46", "event": "train_log", "step": 2266, "epoch": 0.9561181434599156, "progress_pct": 15.94, "epoch_pct": 15.94, "eta": "59:46:38", "max_grad_norm": 0.8, "loss": 0.6701914668083191, "grad_norm": 1.0994901657104492, "learning_rate": 9.81799357404796e-05} +{"ts": "2025-12-27T05:29:05", "event": "train_log", "step": 2268, "epoch": 0.9569620253164557, "progress_pct": 15.95, "epoch_pct": 15.95, "eta": "59:44:33", "max_grad_norm": 0.8, "loss": 0.7205135226249695, "grad_norm": 1.1787796020507812, "learning_rate": 9.817350980773038e-05} +{"ts": "2025-12-27T05:29:24", "event": "train_log", "step": 2270, "epoch": 0.9578059071729957, "progress_pct": 15.96, "epoch_pct": 15.96, "eta": "59:42:31", "max_grad_norm": 0.8, "loss": 0.6897916197776794, "grad_norm": 1.100813627243042, "learning_rate": 9.816707276227763e-05} +{"ts": "2025-12-27T05:29:44", "event": "train_log", "step": 2272, "epoch": 0.9586497890295359, "progress_pct": 15.98, "epoch_pct": 15.98, "eta": "59:40:30", "max_grad_norm": 0.8, "loss": 0.6763570308685303, "grad_norm": 1.1280698776245117, "learning_rate": 9.816062460560627e-05} +{"ts": "2025-12-27T05:30:03", "event": "train_log", "step": 2274, "epoch": 0.959493670886076, "progress_pct": 15.99, "epoch_pct": 15.99, "eta": "59:38:24", "max_grad_norm": 0.8, "loss": 0.6948683857917786, "grad_norm": 1.2322514057159424, "learning_rate": 9.815416533920374e-05} +{"ts": "2025-12-27T05:30:21", "event": "train_log", "step": 2276, "epoch": 0.960337552742616, "progress_pct": 16.01, "epoch_pct": 16.01, "eta": "59:36:16", "max_grad_norm": 0.8, "loss": 0.7876828908920288, "grad_norm": 1.3963630199432373, "learning_rate": 9.814769496456008e-05} +{"ts": "2025-12-27T05:30:40", "event": "train_log", "step": 2278, "epoch": 0.9611814345991562, "progress_pct": 16.02, "epoch_pct": 16.02, "eta": "59:34:09", "max_grad_norm": 0.8, "loss": 0.8191362619400024, "grad_norm": 1.2093676328659058, "learning_rate": 9.814121348316792e-05} +{"ts": "2025-12-27T05:30:58", "event": "train_log", "step": 2280, "epoch": 0.9620253164556962, "progress_pct": 16.03, "epoch_pct": 16.03, "eta": "59:32:01", "max_grad_norm": 0.8, "loss": 0.7162626385688782, "grad_norm": 1.2223572731018066, "learning_rate": 9.813472089652233e-05} +{"ts": "2025-12-27T05:31:17", "event": "train_log", "step": 2282, "epoch": 0.9628691983122363, "progress_pct": 16.05, "epoch_pct": 16.05, "eta": "59:29:58", "max_grad_norm": 0.8, "loss": 0.7183970212936401, "grad_norm": 1.1498078107833862, "learning_rate": 9.812821720612111e-05} +{"ts": "2025-12-27T05:31:37", "event": "train_log", "step": 2284, "epoch": 0.9637130801687763, "progress_pct": 16.06, "epoch_pct": 16.06, "eta": "59:27:57", "max_grad_norm": 0.8, "loss": 0.734487771987915, "grad_norm": 1.1563853025436401, "learning_rate": 9.812170241346449e-05} +{"ts": "2025-12-27T05:31:56", "event": "train_log", "step": 2286, "epoch": 0.9645569620253165, "progress_pct": 16.08, "epoch_pct": 16.08, "eta": "59:25:51", "max_grad_norm": 0.8, "loss": 0.7312371730804443, "grad_norm": 1.1823415756225586, "learning_rate": 9.81151765200553e-05} +{"ts": "2025-12-27T05:32:15", "event": "train_log", "step": 2288, "epoch": 0.9654008438818565, "progress_pct": 16.09, "epoch_pct": 16.09, "eta": "59:23:47", "max_grad_norm": 0.8, "loss": 0.7668377757072449, "grad_norm": 1.1336151361465454, "learning_rate": 9.810863952739899e-05} +{"ts": "2025-12-27T05:32:35", "event": "train_log", "step": 2290, "epoch": 0.9662447257383966, "progress_pct": 16.1, "epoch_pct": 16.1, "eta": "59:21:49", "max_grad_norm": 0.8, "loss": 0.7100399732589722, "grad_norm": 1.0857036113739014, "learning_rate": 9.810209143700347e-05} +{"ts": "2025-12-27T05:32:54", "event": "train_log", "step": 2292, "epoch": 0.9670886075949368, "progress_pct": 16.12, "epoch_pct": 16.12, "eta": "59:19:47", "max_grad_norm": 0.8, "loss": 0.7169836163520813, "grad_norm": 1.1368129253387451, "learning_rate": 9.809553225037926e-05} +{"ts": "2025-12-27T05:33:13", "event": "train_log", "step": 2294, "epoch": 0.9679324894514768, "progress_pct": 16.13, "epoch_pct": 16.13, "eta": "59:17:42", "max_grad_norm": 0.8, "loss": 0.7709535956382751, "grad_norm": 1.141107439994812, "learning_rate": 9.808896196903947e-05} +{"ts": "2025-12-27T05:33:31", "event": "train_log", "step": 2296, "epoch": 0.9687763713080169, "progress_pct": 16.15, "epoch_pct": 16.15, "eta": "59:15:38", "max_grad_norm": 0.8, "loss": 0.7300511002540588, "grad_norm": 1.276405930519104, "learning_rate": 9.808238059449971e-05} +{"ts": "2025-12-27T05:33:51", "event": "train_log", "step": 2298, "epoch": 0.9696202531645569, "progress_pct": 16.16, "epoch_pct": 16.16, "eta": "59:13:38", "max_grad_norm": 0.8, "loss": 0.6259129047393799, "grad_norm": 0.9817046523094177, "learning_rate": 9.80757881282782e-05} +{"ts": "2025-12-27T05:34:09", "event": "train_log", "step": 2300, "epoch": 0.9704641350210971, "progress_pct": 16.17, "epoch_pct": 16.17, "eta": "59:11:29", "max_grad_norm": 0.8, "loss": 0.7361716032028198, "grad_norm": 1.3965257406234741, "learning_rate": 9.806918457189566e-05} +{"ts": "2025-12-27T05:48:33", "event": "train_log", "step": 2300, "epoch": 0.9704641350210971, "progress_pct": 16.17, "epoch_pct": 16.17, "eta": "60:26:08", "max_grad_norm": 0.8, "eval_loss": 0.7464568614959717, "eval_runtime": 864.2128, "eval_samples_per_second": 2.438, "eval_steps_per_second": 2.438} +{"ts": "2025-12-27T05:48:52", "event": "train_log", "step": 2302, "epoch": 0.9713080168776371, "progress_pct": 16.19, "epoch_pct": 16.19, "eta": "60:23:59", "max_grad_norm": 0.8, "loss": 0.805477499961853, "grad_norm": 1.2168612480163574, "learning_rate": 9.806256992687544e-05} +{"ts": "2025-12-27T05:49:12", "event": "train_log", "step": 2304, "epoch": 0.9721518987341772, "progress_pct": 16.2, "epoch_pct": 16.2, "eta": "60:21:58", "max_grad_norm": 0.8, "loss": 0.6673368811607361, "grad_norm": 1.0418168306350708, "learning_rate": 9.80559441947434e-05} +{"ts": "2025-12-27T05:49:30", "event": "train_log", "step": 2306, "epoch": 0.9729957805907173, "progress_pct": 16.22, "epoch_pct": 16.22, "eta": "60:19:49", "max_grad_norm": 0.8, "loss": 0.7585647106170654, "grad_norm": 1.223128318786621, "learning_rate": 9.804930737702796e-05} +{"ts": "2025-12-27T05:49:50", "event": "train_log", "step": 2308, "epoch": 0.9738396624472574, "progress_pct": 16.23, "epoch_pct": 16.23, "eta": "60:17:44", "max_grad_norm": 0.8, "loss": 0.7642034888267517, "grad_norm": 1.264511227607727, "learning_rate": 9.804265947526011e-05} +{"ts": "2025-12-27T05:50:09", "event": "train_log", "step": 2310, "epoch": 0.9746835443037974, "progress_pct": 16.24, "epoch_pct": 16.24, "eta": "60:15:40", "max_grad_norm": 0.8, "loss": 0.7094541192054749, "grad_norm": 1.076887607574463, "learning_rate": 9.803600049097339e-05} +{"ts": "2025-12-27T05:50:28", "event": "train_log", "step": 2312, "epoch": 0.9755274261603376, "progress_pct": 16.26, "epoch_pct": 16.26, "eta": "60:13:35", "max_grad_norm": 0.8, "loss": 0.7370059490203857, "grad_norm": 1.0214987993240356, "learning_rate": 9.802933042570392e-05} +{"ts": "2025-12-27T05:50:47", "event": "train_log", "step": 2314, "epoch": 0.9763713080168777, "progress_pct": 16.27, "epoch_pct": 16.27, "eta": "60:11:27", "max_grad_norm": 0.8, "loss": 0.726834237575531, "grad_norm": 1.3075295686721802, "learning_rate": 9.802264928099035e-05} +{"ts": "2025-12-27T05:51:06", "event": "train_log", "step": 2316, "epoch": 0.9772151898734177, "progress_pct": 16.29, "epoch_pct": 16.29, "eta": "60:09:22", "max_grad_norm": 0.8, "loss": 0.6742353439331055, "grad_norm": 1.057386040687561, "learning_rate": 9.801595705837385e-05} +{"ts": "2025-12-27T05:51:24", "event": "train_log", "step": 2318, "epoch": 0.9780590717299578, "progress_pct": 16.3, "epoch_pct": 16.3, "eta": "60:07:12", "max_grad_norm": 0.8, "loss": 0.6862425208091736, "grad_norm": 1.3998085260391235, "learning_rate": 9.800925375939825e-05} +{"ts": "2025-12-27T05:51:45", "event": "train_log", "step": 2320, "epoch": 0.9789029535864979, "progress_pct": 16.32, "epoch_pct": 16.32, "eta": "60:05:17", "max_grad_norm": 0.8, "loss": 0.6212031245231628, "grad_norm": 1.080574631690979, "learning_rate": 9.800253938560983e-05} +{"ts": "2025-12-27T05:52:04", "event": "train_log", "step": 2322, "epoch": 0.979746835443038, "progress_pct": 16.33, "epoch_pct": 16.33, "eta": "60:03:09", "max_grad_norm": 0.8, "loss": 0.7522522211074829, "grad_norm": 1.3643771409988403, "learning_rate": 9.799581393855748e-05} +{"ts": "2025-12-27T05:52:23", "event": "train_log", "step": 2324, "epoch": 0.980590717299578, "progress_pct": 16.34, "epoch_pct": 16.34, "eta": "60:01:03", "max_grad_norm": 0.8, "loss": 0.7265716791152954, "grad_norm": 1.2455768585205078, "learning_rate": 9.798907741979264e-05} +{"ts": "2025-12-27T05:52:42", "event": "train_log", "step": 2326, "epoch": 0.9814345991561182, "progress_pct": 16.36, "epoch_pct": 16.36, "eta": "59:59:00", "max_grad_norm": 0.8, "loss": 0.7160419225692749, "grad_norm": 1.078774333000183, "learning_rate": 9.798232983086927e-05} +{"ts": "2025-12-27T05:53:01", "event": "train_log", "step": 2328, "epoch": 0.9822784810126582, "progress_pct": 16.37, "epoch_pct": 16.37, "eta": "59:56:54", "max_grad_norm": 0.8, "loss": 0.7991124391555786, "grad_norm": 1.3013948202133179, "learning_rate": 9.797557117334394e-05} +{"ts": "2025-12-27T05:53:19", "event": "train_log", "step": 2330, "epoch": 0.9831223628691983, "progress_pct": 16.39, "epoch_pct": 16.39, "eta": "59:54:48", "max_grad_norm": 0.8, "loss": 0.7193916440010071, "grad_norm": 1.2216732501983643, "learning_rate": 9.796880144877572e-05} +{"ts": "2025-12-27T05:53:38", "event": "train_log", "step": 2332, "epoch": 0.9839662447257383, "progress_pct": 16.4, "epoch_pct": 16.4, "eta": "59:52:42", "max_grad_norm": 0.8, "loss": 0.7184370756149292, "grad_norm": 1.1469542980194092, "learning_rate": 9.796202065872627e-05} +{"ts": "2025-12-27T05:53:58", "event": "train_log", "step": 2334, "epoch": 0.9848101265822785, "progress_pct": 16.41, "epoch_pct": 16.41, "eta": "59:50:43", "max_grad_norm": 0.8, "loss": 0.6474619507789612, "grad_norm": 1.0431830883026123, "learning_rate": 9.795522880475979e-05} +{"ts": "2025-12-27T05:54:17", "event": "train_log", "step": 2336, "epoch": 0.9856540084388186, "progress_pct": 16.43, "epoch_pct": 16.43, "eta": "59:48:40", "max_grad_norm": 0.8, "loss": 0.6392545700073242, "grad_norm": 1.1819576025009155, "learning_rate": 9.794842588844299e-05} +{"ts": "2025-12-27T05:54:37", "event": "train_log", "step": 2338, "epoch": 0.9864978902953586, "progress_pct": 16.44, "epoch_pct": 16.44, "eta": "59:46:39", "max_grad_norm": 0.8, "loss": 0.7358114719390869, "grad_norm": 1.1984983682632446, "learning_rate": 9.794161191134525e-05} +{"ts": "2025-12-27T05:54:57", "event": "train_log", "step": 2340, "epoch": 0.9873417721518988, "progress_pct": 16.46, "epoch_pct": 16.46, "eta": "59:44:39", "max_grad_norm": 0.8, "loss": 0.6762020587921143, "grad_norm": 1.3378512859344482, "learning_rate": 9.793478687503834e-05} +{"ts": "2025-12-27T05:55:15", "event": "train_log", "step": 2342, "epoch": 0.9881856540084388, "progress_pct": 16.47, "epoch_pct": 16.47, "eta": "59:42:34", "max_grad_norm": 0.8, "loss": 0.7478934526443481, "grad_norm": 1.272674560546875, "learning_rate": 9.792795078109673e-05} +{"ts": "2025-12-27T05:55:35", "event": "train_log", "step": 2344, "epoch": 0.9890295358649789, "progress_pct": 16.48, "epoch_pct": 16.48, "eta": "59:40:34", "max_grad_norm": 0.8, "loss": 0.7316533923149109, "grad_norm": 1.153746247291565, "learning_rate": 9.792110363109733e-05} +{"ts": "2025-12-27T05:55:54", "event": "train_log", "step": 2346, "epoch": 0.9898734177215189, "progress_pct": 16.5, "epoch_pct": 16.5, "eta": "59:38:32", "max_grad_norm": 0.8, "loss": 0.7078539133071899, "grad_norm": 1.1361702680587769, "learning_rate": 9.791424542661967e-05} +{"ts": "2025-12-27T05:56:12", "event": "train_log", "step": 2348, "epoch": 0.9907172995780591, "progress_pct": 16.51, "epoch_pct": 16.51, "eta": "59:36:25", "max_grad_norm": 0.8, "loss": 0.7945935130119324, "grad_norm": 1.3043115139007568, "learning_rate": 9.790737616924581e-05} +{"ts": "2025-12-27T05:56:32", "event": "train_log", "step": 2350, "epoch": 0.9915611814345991, "progress_pct": 16.53, "epoch_pct": 16.53, "eta": "59:34:24", "max_grad_norm": 0.8, "loss": 0.8247197866439819, "grad_norm": 1.1913264989852905, "learning_rate": 9.790049586056034e-05} +{"ts": "2025-12-27T05:56:50", "event": "train_log", "step": 2352, "epoch": 0.9924050632911392, "progress_pct": 16.54, "epoch_pct": 16.54, "eta": "59:32:18", "max_grad_norm": 0.8, "loss": 0.7099657654762268, "grad_norm": 1.1560171842575073, "learning_rate": 9.789360450215041e-05} +{"ts": "2025-12-27T05:57:09", "event": "train_log", "step": 2354, "epoch": 0.9932489451476794, "progress_pct": 16.55, "epoch_pct": 16.55, "eta": "59:30:17", "max_grad_norm": 0.8, "loss": 0.7480318546295166, "grad_norm": 1.2311041355133057, "learning_rate": 9.788670209560575e-05} +{"ts": "2025-12-27T05:57:29", "event": "train_log", "step": 2356, "epoch": 0.9940928270042194, "progress_pct": 16.57, "epoch_pct": 16.57, "eta": "59:28:16", "max_grad_norm": 0.8, "loss": 0.6870889067649841, "grad_norm": 1.1584707498550415, "learning_rate": 9.787978864251859e-05} +{"ts": "2025-12-27T05:57:49", "event": "train_log", "step": 2358, "epoch": 0.9949367088607595, "progress_pct": 16.58, "epoch_pct": 16.58, "eta": "59:26:21", "max_grad_norm": 0.8, "loss": 0.6114922165870667, "grad_norm": 1.057478666305542, "learning_rate": 9.787286414448375e-05} +{"ts": "2025-12-27T05:58:09", "event": "train_log", "step": 2360, "epoch": 0.9957805907172996, "progress_pct": 16.6, "epoch_pct": 16.6, "eta": "59:24:25", "max_grad_norm": 0.8, "loss": 0.6955118179321289, "grad_norm": 1.1431775093078613, "learning_rate": 9.786592860309856e-05} +{"ts": "2025-12-27T05:58:28", "event": "train_log", "step": 2362, "epoch": 0.9966244725738397, "progress_pct": 16.61, "epoch_pct": 16.61, "eta": "59:22:20", "max_grad_norm": 0.8, "loss": 0.735048770904541, "grad_norm": 1.232142448425293, "learning_rate": 9.785898201996292e-05} +{"ts": "2025-12-27T05:58:49", "event": "train_log", "step": 2364, "epoch": 0.9974683544303797, "progress_pct": 16.62, "epoch_pct": 16.62, "eta": "59:20:30", "max_grad_norm": 0.8, "loss": 0.7150241136550903, "grad_norm": 1.1236306428909302, "learning_rate": 9.785202439667928e-05} +{"ts": "2025-12-27T05:59:09", "event": "train_log", "step": 2366, "epoch": 0.9983122362869198, "progress_pct": 16.64, "epoch_pct": 16.64, "eta": "59:18:33", "max_grad_norm": 0.8, "loss": 0.6870222687721252, "grad_norm": 1.0517534017562866, "learning_rate": 9.784505573485263e-05} +{"ts": "2025-12-27T05:59:28", "event": "train_log", "step": 2368, "epoch": 0.99915611814346, "progress_pct": 16.65, "epoch_pct": 16.65, "eta": "59:16:31", "max_grad_norm": 0.8, "loss": 0.7521567940711975, "grad_norm": 1.1747480630874634, "learning_rate": 9.78380760360905e-05} +{"ts": "2025-12-27T05:59:46", "event": "train_log", "step": 2370, "epoch": 1.0, "progress_pct": 16.67, "epoch_pct": 16.67, "eta": "59:14:26", "max_grad_norm": 0.8, "loss": 0.7336234450340271, "grad_norm": 1.2790346145629883, "learning_rate": 9.783108530200298e-05} +{"ts": "2025-12-27T06:00:05", "event": "train_log", "step": 2372, "epoch": 1.0008438818565402, "progress_pct": 16.68, "epoch_pct": 16.68, "eta": "59:12:29", "max_grad_norm": 0.8, "loss": 0.6378109455108643, "grad_norm": 1.1216399669647217, "learning_rate": 9.78240835342027e-05} +{"ts": "2025-12-27T06:00:26", "event": "train_log", "step": 2374, "epoch": 1.00168776371308, "progress_pct": 16.69, "epoch_pct": 16.69, "eta": "59:10:35", "max_grad_norm": 0.8, "loss": 0.6174905300140381, "grad_norm": 1.267336368560791, "learning_rate": 9.781707073430482e-05} +{"ts": "2025-12-27T06:00:47", "event": "train_log", "step": 2376, "epoch": 1.0025316455696203, "progress_pct": 16.71, "epoch_pct": 16.71, "eta": "59:08:43", "max_grad_norm": 0.8, "loss": 0.6579123139381409, "grad_norm": 1.1342934370040894, "learning_rate": 9.781004690392706e-05} +{"ts": "2025-12-27T06:01:06", "event": "train_log", "step": 2378, "epoch": 1.0033755274261604, "progress_pct": 16.72, "epoch_pct": 16.72, "eta": "59:06:45", "max_grad_norm": 0.8, "loss": 0.6679617166519165, "grad_norm": 1.1317468881607056, "learning_rate": 9.78030120446897e-05} +{"ts": "2025-12-27T06:01:25", "event": "train_log", "step": 2380, "epoch": 1.0042194092827004, "progress_pct": 16.74, "epoch_pct": 16.74, "eta": "59:04:44", "max_grad_norm": 0.8, "loss": 0.7368149161338806, "grad_norm": 1.2992616891860962, "learning_rate": 9.779596615821552e-05} +{"ts": "2025-12-27T06:01:45", "event": "train_log", "step": 2382, "epoch": 1.0050632911392405, "progress_pct": 16.75, "epoch_pct": 16.75, "eta": "59:02:50", "max_grad_norm": 0.8, "loss": 0.6887164115905762, "grad_norm": 1.1714510917663574, "learning_rate": 9.77889092461299e-05} +{"ts": "2025-12-27T06:02:04", "event": "train_log", "step": 2384, "epoch": 1.0059071729957807, "progress_pct": 16.77, "epoch_pct": 16.77, "eta": "59:00:48", "max_grad_norm": 0.8, "loss": 0.681344211101532, "grad_norm": 1.1670639514923096, "learning_rate": 9.778184131006071e-05} +{"ts": "2025-12-27T06:02:24", "event": "train_log", "step": 2386, "epoch": 1.0067510548523206, "progress_pct": 16.78, "epoch_pct": 16.78, "eta": "58:58:53", "max_grad_norm": 0.8, "loss": 0.7342769503593445, "grad_norm": 1.2487291097640991, "learning_rate": 9.77747623516384e-05} +{"ts": "2025-12-27T06:02:43", "event": "train_log", "step": 2388, "epoch": 1.0075949367088608, "progress_pct": 16.79, "epoch_pct": 16.79, "eta": "58:56:55", "max_grad_norm": 0.8, "loss": 0.577454149723053, "grad_norm": 1.2408956289291382, "learning_rate": 9.776767237249595e-05} +{"ts": "2025-12-27T06:03:03", "event": "train_log", "step": 2390, "epoch": 1.0084388185654007, "progress_pct": 16.81, "epoch_pct": 16.81, "eta": "58:55:02", "max_grad_norm": 0.8, "loss": 0.6588307023048401, "grad_norm": 1.067991852760315, "learning_rate": 9.776057137426889e-05} +{"ts": "2025-12-27T06:03:23", "event": "train_log", "step": 2392, "epoch": 1.009282700421941, "progress_pct": 16.82, "epoch_pct": 16.82, "eta": "58:53:05", "max_grad_norm": 0.8, "loss": 0.7045041918754578, "grad_norm": 1.2821543216705322, "learning_rate": 9.775345935859525e-05} +{"ts": "2025-12-27T06:03:42", "event": "train_log", "step": 2394, "epoch": 1.010126582278481, "progress_pct": 16.84, "epoch_pct": 16.84, "eta": "58:51:05", "max_grad_norm": 0.8, "loss": 0.7141479253768921, "grad_norm": 1.3160134553909302, "learning_rate": 9.774633632711569e-05} +{"ts": "2025-12-27T06:04:00", "event": "train_log", "step": 2396, "epoch": 1.010970464135021, "progress_pct": 16.85, "epoch_pct": 16.85, "eta": "58:49:06", "max_grad_norm": 0.8, "loss": 0.723293662071228, "grad_norm": 1.66774320602417, "learning_rate": 9.773920228147329e-05} +{"ts": "2025-12-27T06:04:20", "event": "train_log", "step": 2398, "epoch": 1.0118143459915612, "progress_pct": 16.86, "epoch_pct": 16.86, "eta": "58:47:11", "max_grad_norm": 0.8, "loss": 0.5812023878097534, "grad_norm": 1.027588963508606, "learning_rate": 9.77320572233138e-05} +{"ts": "2025-12-27T06:04:39", "event": "train_log", "step": 2400, "epoch": 1.0126582278481013, "progress_pct": 16.88, "epoch_pct": 16.88, "eta": "58:45:09", "max_grad_norm": 0.8, "loss": 0.7071458101272583, "grad_norm": 1.406507968902588, "learning_rate": 9.77249011542854e-05} +{"ts": "2025-12-27T06:18:53", "event": "train_log", "step": 2400, "epoch": 1.0126582278481013, "progress_pct": 16.88, "epoch_pct": 16.88, "eta": "59:55:16", "max_grad_norm": 0.8, "eval_loss": 0.7421699166297913, "eval_runtime": 854.2185, "eval_samples_per_second": 2.467, "eval_steps_per_second": 2.467} +{"ts": "2025-12-27T06:19:12", "event": "train_log", "step": 2402, "epoch": 1.0135021097046413, "progress_pct": 16.89, "epoch_pct": 16.89, "eta": "59:53:16", "max_grad_norm": 0.8, "loss": 0.7049722671508789, "grad_norm": 1.1236240863800049, "learning_rate": 9.771773407603889e-05} +{"ts": "2025-12-27T06:19:33", "event": "train_log", "step": 2404, "epoch": 1.0143459915611814, "progress_pct": 16.91, "epoch_pct": 16.91, "eta": "59:51:22", "max_grad_norm": 0.8, "loss": 0.635308027267456, "grad_norm": 1.1924289464950562, "learning_rate": 9.771055599022756e-05} +{"ts": "2025-12-27T06:19:52", "event": "train_log", "step": 2406, "epoch": 1.0151898734177216, "progress_pct": 16.92, "epoch_pct": 16.92, "eta": "59:49:22", "max_grad_norm": 0.8, "loss": 0.7286487817764282, "grad_norm": 1.1744966506958008, "learning_rate": 9.770336689850727e-05} +{"ts": "2025-12-27T06:20:11", "event": "train_log", "step": 2408, "epoch": 1.0160337552742615, "progress_pct": 16.93, "epoch_pct": 16.93, "eta": "59:47:18", "max_grad_norm": 0.8, "loss": 0.6828222274780273, "grad_norm": 1.2131173610687256, "learning_rate": 9.769616680253639e-05} +{"ts": "2025-12-27T06:20:32", "event": "train_log", "step": 2410, "epoch": 1.0168776371308017, "progress_pct": 16.95, "epoch_pct": 16.95, "eta": "59:45:24", "max_grad_norm": 0.8, "loss": 0.6652156114578247, "grad_norm": 1.0517828464508057, "learning_rate": 9.768895570397585e-05} +{"ts": "2025-12-27T06:20:52", "event": "train_log", "step": 2412, "epoch": 1.0177215189873419, "progress_pct": 16.96, "epoch_pct": 16.96, "eta": "59:43:28", "max_grad_norm": 0.8, "loss": 0.7278267741203308, "grad_norm": 1.1603758335113525, "learning_rate": 9.768173360448912e-05} +{"ts": "2025-12-27T06:21:11", "event": "train_log", "step": 2414, "epoch": 1.0185654008438818, "progress_pct": 16.98, "epoch_pct": 16.98, "eta": "59:41:27", "max_grad_norm": 0.8, "loss": 0.6082334518432617, "grad_norm": 1.3167752027511597, "learning_rate": 9.767450050574218e-05} +{"ts": "2025-12-27T06:21:30", "event": "train_log", "step": 2416, "epoch": 1.019409282700422, "progress_pct": 16.99, "epoch_pct": 16.99, "eta": "59:39:26", "max_grad_norm": 0.8, "loss": 0.67228102684021, "grad_norm": 1.1754449605941772, "learning_rate": 9.766725640940358e-05} +{"ts": "2025-12-27T06:21:50", "event": "train_log", "step": 2418, "epoch": 1.0202531645569621, "progress_pct": 17.0, "epoch_pct": 17.0, "eta": "59:37:31", "max_grad_norm": 0.8, "loss": 0.5984366536140442, "grad_norm": 1.060952067375183, "learning_rate": 9.766000131714442e-05} +{"ts": "2025-12-27T06:22:10", "event": "train_log", "step": 2420, "epoch": 1.021097046413502, "progress_pct": 17.02, "epoch_pct": 17.02, "eta": "59:35:33", "max_grad_norm": 0.8, "loss": 0.690661609172821, "grad_norm": 1.0826152563095093, "learning_rate": 9.765273523063825e-05} +{"ts": "2025-12-27T06:22:28", "event": "train_log", "step": 2422, "epoch": 1.0219409282700422, "progress_pct": 17.03, "epoch_pct": 17.03, "eta": "59:33:27", "max_grad_norm": 0.8, "loss": 0.7960668802261353, "grad_norm": 1.423723816871643, "learning_rate": 9.764545815156125e-05} +{"ts": "2025-12-27T06:22:50", "event": "train_log", "step": 2424, "epoch": 1.0227848101265822, "progress_pct": 17.05, "epoch_pct": 17.05, "eta": "59:31:39", "max_grad_norm": 0.8, "loss": 0.6971074342727661, "grad_norm": 1.0882549285888672, "learning_rate": 9.763817008159212e-05} +{"ts": "2025-12-27T06:23:08", "event": "train_log", "step": 2426, "epoch": 1.0236286919831223, "progress_pct": 17.06, "epoch_pct": 17.06, "eta": "59:29:37", "max_grad_norm": 0.8, "loss": 0.6854458451271057, "grad_norm": 1.1053040027618408, "learning_rate": 9.763087102241206e-05} +{"ts": "2025-12-27T06:23:29", "event": "train_log", "step": 2428, "epoch": 1.0244725738396625, "progress_pct": 17.07, "epoch_pct": 17.07, "eta": "59:27:45", "max_grad_norm": 0.8, "loss": 0.6724489331245422, "grad_norm": 1.1975224018096924, "learning_rate": 9.762356097570482e-05} +{"ts": "2025-12-27T06:23:49", "event": "train_log", "step": 2430, "epoch": 1.0253164556962024, "progress_pct": 17.09, "epoch_pct": 17.09, "eta": "59:25:48", "max_grad_norm": 0.8, "loss": 0.7064506411552429, "grad_norm": 1.1692171096801758, "learning_rate": 9.76162399431567e-05} +{"ts": "2025-12-27T06:24:08", "event": "train_log", "step": 2432, "epoch": 1.0261603375527426, "progress_pct": 17.1, "epoch_pct": 17.1, "eta": "59:23:51", "max_grad_norm": 0.8, "loss": 0.6605257391929626, "grad_norm": 1.1927787065505981, "learning_rate": 9.760890792645649e-05} +{"ts": "2025-12-27T06:24:28", "event": "train_log", "step": 2434, "epoch": 1.0270042194092828, "progress_pct": 17.12, "epoch_pct": 17.12, "eta": "59:21:54", "max_grad_norm": 0.8, "loss": 0.6872501373291016, "grad_norm": 1.4147427082061768, "learning_rate": 9.760156492729558e-05} +{"ts": "2025-12-27T06:24:48", "event": "train_log", "step": 2436, "epoch": 1.0278481012658227, "progress_pct": 17.13, "epoch_pct": 17.13, "eta": "59:20:00", "max_grad_norm": 0.8, "loss": 0.7117500305175781, "grad_norm": 1.2503126859664917, "learning_rate": 9.759421094736785e-05} +{"ts": "2025-12-27T06:25:09", "event": "train_log", "step": 2438, "epoch": 1.0286919831223629, "progress_pct": 17.14, "epoch_pct": 17.14, "eta": "59:18:08", "max_grad_norm": 0.8, "loss": 0.6740369200706482, "grad_norm": 1.229978084564209, "learning_rate": 9.758684598836971e-05} +{"ts": "2025-12-27T06:25:28", "event": "train_log", "step": 2440, "epoch": 1.029535864978903, "progress_pct": 17.16, "epoch_pct": 17.16, "eta": "59:16:10", "max_grad_norm": 0.8, "loss": 0.7215790748596191, "grad_norm": 1.4765945672988892, "learning_rate": 9.757947005200014e-05} +{"ts": "2025-12-27T06:25:49", "event": "train_log", "step": 2442, "epoch": 1.030379746835443, "progress_pct": 17.17, "epoch_pct": 17.17, "eta": "59:14:18", "max_grad_norm": 0.8, "loss": 0.6961746215820312, "grad_norm": 1.282632827758789, "learning_rate": 9.757208313996061e-05} +{"ts": "2025-12-27T06:26:08", "event": "train_log", "step": 2444, "epoch": 1.0312236286919831, "progress_pct": 17.19, "epoch_pct": 17.19, "eta": "59:12:21", "max_grad_norm": 0.8, "loss": 0.6348349452018738, "grad_norm": 1.259828805923462, "learning_rate": 9.756468525395512e-05} +{"ts": "2025-12-27T06:26:28", "event": "train_log", "step": 2446, "epoch": 1.0320675105485233, "progress_pct": 17.2, "epoch_pct": 17.2, "eta": "59:10:27", "max_grad_norm": 0.8, "loss": 0.6756057739257812, "grad_norm": 1.0984172821044922, "learning_rate": 9.755727639569024e-05} +{"ts": "2025-12-27T06:26:45", "event": "train_log", "step": 2448, "epoch": 1.0329113924050632, "progress_pct": 17.22, "epoch_pct": 17.22, "eta": "59:08:18", "max_grad_norm": 0.8, "loss": 0.6968509554862976, "grad_norm": 1.235835075378418, "learning_rate": 9.754985656687506e-05} +{"ts": "2025-12-27T06:27:05", "event": "train_log", "step": 2450, "epoch": 1.0337552742616034, "progress_pct": 17.23, "epoch_pct": 17.23, "eta": "59:06:26", "max_grad_norm": 0.8, "loss": 0.6793950796127319, "grad_norm": 1.273032546043396, "learning_rate": 9.754242576922119e-05} +{"ts": "2025-12-27T06:27:26", "event": "train_log", "step": 2452, "epoch": 1.0345991561181433, "progress_pct": 17.24, "epoch_pct": 17.24, "eta": "59:04:34", "max_grad_norm": 0.8, "loss": 0.645270586013794, "grad_norm": 1.251996397972107, "learning_rate": 9.753498400444274e-05} +{"ts": "2025-12-27T06:27:45", "event": "train_log", "step": 2454, "epoch": 1.0354430379746835, "progress_pct": 17.26, "epoch_pct": 17.26, "eta": "59:02:36", "max_grad_norm": 0.8, "loss": 0.7291322350502014, "grad_norm": 1.4310805797576904, "learning_rate": 9.752753127425642e-05} +{"ts": "2025-12-27T06:28:04", "event": "train_log", "step": 2456, "epoch": 1.0362869198312237, "progress_pct": 17.27, "epoch_pct": 17.27, "eta": "59:00:39", "max_grad_norm": 0.8, "loss": 0.7553019523620605, "grad_norm": 1.6582196950912476, "learning_rate": 9.752006758038142e-05} +{"ts": "2025-12-27T06:28:25", "event": "train_log", "step": 2458, "epoch": 1.0371308016877636, "progress_pct": 17.29, "epoch_pct": 17.29, "eta": "58:58:50", "max_grad_norm": 0.8, "loss": 0.5637331008911133, "grad_norm": 1.081773042678833, "learning_rate": 9.751259292453947e-05} +{"ts": "2025-12-27T06:28:46", "event": "train_log", "step": 2460, "epoch": 1.0379746835443038, "progress_pct": 17.3, "epoch_pct": 17.3, "eta": "58:57:03", "max_grad_norm": 0.8, "loss": 0.6012396216392517, "grad_norm": 1.1483876705169678, "learning_rate": 9.750510730845483e-05} +{"ts": "2025-12-27T06:29:06", "event": "train_log", "step": 2462, "epoch": 1.038818565400844, "progress_pct": 17.31, "epoch_pct": 17.31, "eta": "58:55:09", "max_grad_norm": 0.8, "loss": 0.6795822381973267, "grad_norm": 1.0879185199737549, "learning_rate": 9.749761073385428e-05} +{"ts": "2025-12-27T06:29:26", "event": "train_log", "step": 2464, "epoch": 1.0396624472573839, "progress_pct": 17.33, "epoch_pct": 17.33, "eta": "58:53:17", "max_grad_norm": 0.8, "loss": 0.6895145773887634, "grad_norm": 1.2378218173980713, "learning_rate": 9.749010320246714e-05} +{"ts": "2025-12-27T06:29:45", "event": "train_log", "step": 2466, "epoch": 1.040506329113924, "progress_pct": 17.34, "epoch_pct": 17.34, "eta": "58:51:17", "max_grad_norm": 0.8, "loss": 0.7124115228652954, "grad_norm": 1.253233790397644, "learning_rate": 9.748258471602527e-05} +{"ts": "2025-12-27T06:30:03", "event": "train_log", "step": 2468, "epoch": 1.0413502109704642, "progress_pct": 17.36, "epoch_pct": 17.36, "eta": "58:49:16", "max_grad_norm": 0.8, "loss": 0.7304861545562744, "grad_norm": 1.3994864225387573, "learning_rate": 9.747505527626302e-05} +{"ts": "2025-12-27T06:30:23", "event": "train_log", "step": 2470, "epoch": 1.0421940928270041, "progress_pct": 17.37, "epoch_pct": 17.37, "eta": "58:47:25", "max_grad_norm": 0.8, "loss": 0.6845837831497192, "grad_norm": 1.2360669374465942, "learning_rate": 9.74675148849173e-05} +{"ts": "2025-12-27T06:30:44", "event": "train_log", "step": 2472, "epoch": 1.0430379746835443, "progress_pct": 17.38, "epoch_pct": 17.38, "eta": "58:45:39", "max_grad_norm": 0.8, "loss": 0.6780203580856323, "grad_norm": 1.126849889755249, "learning_rate": 9.74599635437275e-05} +{"ts": "2025-12-27T06:31:04", "event": "train_log", "step": 2474, "epoch": 1.0438818565400845, "progress_pct": 17.4, "epoch_pct": 17.4, "eta": "58:43:46", "max_grad_norm": 0.8, "loss": 0.7550003528594971, "grad_norm": 1.169788122177124, "learning_rate": 9.745240125443562e-05} +{"ts": "2025-12-27T06:31:23", "event": "train_log", "step": 2476, "epoch": 1.0447257383966244, "progress_pct": 17.41, "epoch_pct": 17.41, "eta": "58:41:46", "max_grad_norm": 0.8, "loss": 0.6910399198532104, "grad_norm": 1.1311867237091064, "learning_rate": 9.744482801878612e-05} +{"ts": "2025-12-27T06:31:43", "event": "train_log", "step": 2478, "epoch": 1.0455696202531646, "progress_pct": 17.43, "epoch_pct": 17.43, "eta": "58:39:55", "max_grad_norm": 0.8, "loss": 0.7164814472198486, "grad_norm": 1.1267731189727783, "learning_rate": 9.743724383852597e-05} +{"ts": "2025-12-27T06:32:02", "event": "train_log", "step": 2480, "epoch": 1.0464135021097047, "progress_pct": 17.44, "epoch_pct": 17.44, "eta": "58:38:00", "max_grad_norm": 0.8, "loss": 0.6428439617156982, "grad_norm": 1.2239704132080078, "learning_rate": 9.742964871540472e-05} +{"ts": "2025-12-27T06:32:22", "event": "train_log", "step": 2482, "epoch": 1.0472573839662447, "progress_pct": 17.45, "epoch_pct": 17.45, "eta": "58:36:08", "max_grad_norm": 0.8, "loss": 0.6994290351867676, "grad_norm": 1.1854743957519531, "learning_rate": 9.742204265117443e-05} +{"ts": "2025-12-27T06:32:44", "event": "train_log", "step": 2484, "epoch": 1.0481012658227848, "progress_pct": 17.47, "epoch_pct": 17.47, "eta": "58:34:25", "max_grad_norm": 0.8, "loss": 0.6725777983665466, "grad_norm": 1.0695894956588745, "learning_rate": 9.741442564758964e-05} +{"ts": "2025-12-27T06:33:04", "event": "train_log", "step": 2486, "epoch": 1.048945147679325, "progress_pct": 17.48, "epoch_pct": 17.48, "eta": "58:32:37", "max_grad_norm": 0.8, "loss": 0.6538674235343933, "grad_norm": 1.1799863576889038, "learning_rate": 9.740679770640748e-05} +{"ts": "2025-12-27T06:33:24", "event": "train_log", "step": 2488, "epoch": 1.049789029535865, "progress_pct": 17.5, "epoch_pct": 17.5, "eta": "58:30:44", "max_grad_norm": 0.8, "loss": 0.780756950378418, "grad_norm": 1.295546293258667, "learning_rate": 9.739915882938754e-05} +{"ts": "2025-12-27T06:33:44", "event": "train_log", "step": 2490, "epoch": 1.0506329113924051, "progress_pct": 17.51, "epoch_pct": 17.51, "eta": "58:28:55", "max_grad_norm": 0.8, "loss": 0.6657930612564087, "grad_norm": 1.2371755838394165, "learning_rate": 9.739150901829198e-05} +{"ts": "2025-12-27T06:34:04", "event": "train_log", "step": 2492, "epoch": 1.051476793248945, "progress_pct": 17.52, "epoch_pct": 17.52, "eta": "58:27:03", "max_grad_norm": 0.8, "loss": 0.6675208210945129, "grad_norm": 1.103037714958191, "learning_rate": 9.738384827488547e-05} +{"ts": "2025-12-27T06:34:23", "event": "train_log", "step": 2494, "epoch": 1.0523206751054852, "progress_pct": 17.54, "epoch_pct": 17.54, "eta": "58:25:09", "max_grad_norm": 0.8, "loss": 0.6693358421325684, "grad_norm": 1.1835435628890991, "learning_rate": 9.737617660093517e-05} +{"ts": "2025-12-27T06:34:44", "event": "train_log", "step": 2496, "epoch": 1.0531645569620254, "progress_pct": 17.55, "epoch_pct": 17.55, "eta": "58:23:25", "max_grad_norm": 0.8, "loss": 0.624502956867218, "grad_norm": 1.003771424293518, "learning_rate": 9.736849399821082e-05} +{"ts": "2025-12-27T06:35:04", "event": "train_log", "step": 2498, "epoch": 1.0540084388185653, "progress_pct": 17.57, "epoch_pct": 17.57, "eta": "58:21:34", "max_grad_norm": 0.8, "loss": 0.6350868344306946, "grad_norm": 1.1391769647598267, "learning_rate": 9.736080046848463e-05} +{"ts": "2025-12-27T06:35:23", "event": "train_log", "step": 2500, "epoch": 1.0548523206751055, "progress_pct": 17.58, "epoch_pct": 17.58, "eta": "58:19:38", "max_grad_norm": 0.8, "loss": 0.6721012592315674, "grad_norm": 1.376518726348877, "learning_rate": 9.735309601353134e-05} +{"ts": "2025-12-27T06:49:31", "event": "train_log", "step": 2500, "epoch": 1.0548523206751055, "progress_pct": 17.58, "epoch_pct": 17.58, "eta": "59:25:52", "max_grad_norm": 0.8, "eval_loss": 0.741338849067688, "eval_runtime": 847.7478, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485} +{"ts": "2025-12-27T06:49:52", "event": "train_log", "step": 2502, "epoch": 1.0556962025316456, "progress_pct": 17.59, "epoch_pct": 17.59, "eta": "59:24:06", "max_grad_norm": 0.8, "loss": 0.6888233423233032, "grad_norm": 1.194190502166748, "learning_rate": 9.734538063512824e-05} +{"ts": "2025-12-27T06:50:12", "event": "train_log", "step": 2504, "epoch": 1.0565400843881856, "progress_pct": 17.61, "epoch_pct": 17.61, "eta": "59:22:12", "max_grad_norm": 0.8, "loss": 0.7095553278923035, "grad_norm": 1.378830909729004, "learning_rate": 9.733765433505513e-05} +{"ts": "2025-12-27T06:50:32", "event": "train_log", "step": 2506, "epoch": 1.0573839662447257, "progress_pct": 17.62, "epoch_pct": 17.62, "eta": "59:20:16", "max_grad_norm": 0.8, "loss": 0.6734166145324707, "grad_norm": 1.1289541721343994, "learning_rate": 9.732991711509428e-05} +{"ts": "2025-12-27T06:50:51", "event": "train_log", "step": 2508, "epoch": 1.058227848101266, "progress_pct": 17.64, "epoch_pct": 17.64, "eta": "59:18:19", "max_grad_norm": 0.8, "loss": 0.7006195187568665, "grad_norm": 1.1858116388320923, "learning_rate": 9.732216897703054e-05} +{"ts": "2025-12-27T06:51:10", "event": "train_log", "step": 2510, "epoch": 1.0590717299578059, "progress_pct": 17.65, "epoch_pct": 17.65, "eta": "59:16:21", "max_grad_norm": 0.8, "loss": 0.6481205821037292, "grad_norm": 1.1365686655044556, "learning_rate": 9.731440992265127e-05} +{"ts": "2025-12-27T06:51:28", "event": "train_log", "step": 2512, "epoch": 1.059915611814346, "progress_pct": 17.67, "epoch_pct": 17.67, "eta": "59:14:18", "max_grad_norm": 0.8, "loss": 0.679282546043396, "grad_norm": 1.2886228561401367, "learning_rate": 9.730663995374632e-05} +{"ts": "2025-12-27T06:51:48", "event": "train_log", "step": 2514, "epoch": 1.0607594936708862, "progress_pct": 17.68, "epoch_pct": 17.68, "eta": "59:12:24", "max_grad_norm": 0.8, "loss": 0.7656359672546387, "grad_norm": 1.355322003364563, "learning_rate": 9.729885907210808e-05} +{"ts": "2025-12-27T06:52:08", "event": "train_log", "step": 2516, "epoch": 1.0616033755274261, "progress_pct": 17.69, "epoch_pct": 17.69, "eta": "59:10:30", "max_grad_norm": 0.8, "loss": 0.5996183156967163, "grad_norm": 1.1552364826202393, "learning_rate": 9.729106727953142e-05} +{"ts": "2025-12-27T06:52:28", "event": "train_log", "step": 2518, "epoch": 1.0624472573839663, "progress_pct": 17.71, "epoch_pct": 17.71, "eta": "59:08:39", "max_grad_norm": 0.8, "loss": 0.7599716782569885, "grad_norm": 1.1419235467910767, "learning_rate": 9.728326457781381e-05} +{"ts": "2025-12-27T06:52:48", "event": "train_log", "step": 2520, "epoch": 1.0632911392405062, "progress_pct": 17.72, "epoch_pct": 17.72, "eta": "59:06:48", "max_grad_norm": 0.8, "loss": 0.7150241732597351, "grad_norm": 1.2240079641342163, "learning_rate": 9.727545096875512e-05} +{"ts": "2025-12-27T06:53:08", "event": "train_log", "step": 2522, "epoch": 1.0641350210970464, "progress_pct": 17.74, "epoch_pct": 17.74, "eta": "59:04:56", "max_grad_norm": 0.8, "loss": 0.734352171421051, "grad_norm": 1.2463440895080566, "learning_rate": 9.726762645415785e-05} +{"ts": "2025-12-27T06:53:29", "event": "train_log", "step": 2524, "epoch": 1.0649789029535865, "progress_pct": 17.75, "epoch_pct": 17.75, "eta": "59:03:07", "max_grad_norm": 0.8, "loss": 0.6950796842575073, "grad_norm": 1.1680364608764648, "learning_rate": 9.725979103582697e-05} +{"ts": "2025-12-27T06:53:49", "event": "train_log", "step": 2526, "epoch": 1.0658227848101265, "progress_pct": 17.76, "epoch_pct": 17.76, "eta": "59:01:14", "max_grad_norm": 0.8, "loss": 0.7096341252326965, "grad_norm": 1.1680421829223633, "learning_rate": 9.725194471556991e-05} +{"ts": "2025-12-27T06:54:10", "event": "train_log", "step": 2528, "epoch": 1.0666666666666667, "progress_pct": 17.78, "epoch_pct": 17.78, "eta": "58:59:28", "max_grad_norm": 0.8, "loss": 0.6486304402351379, "grad_norm": 1.043717861175537, "learning_rate": 9.724408749519671e-05} +{"ts": "2025-12-27T06:54:30", "event": "train_log", "step": 2530, "epoch": 1.0675105485232068, "progress_pct": 17.79, "epoch_pct": 17.79, "eta": "58:57:38", "max_grad_norm": 0.8, "loss": 0.6519505381584167, "grad_norm": 1.1240284442901611, "learning_rate": 9.723621937651985e-05} +{"ts": "2025-12-27T06:54:50", "event": "train_log", "step": 2532, "epoch": 1.0683544303797468, "progress_pct": 17.81, "epoch_pct": 17.81, "eta": "58:55:46", "max_grad_norm": 0.8, "loss": 0.6724293231964111, "grad_norm": 1.185223937034607, "learning_rate": 9.722834036135439e-05} +{"ts": "2025-12-27T06:55:09", "event": "train_log", "step": 2534, "epoch": 1.069198312236287, "progress_pct": 17.82, "epoch_pct": 17.82, "eta": "58:53:49", "max_grad_norm": 0.8, "loss": 0.6886576414108276, "grad_norm": 1.3234196901321411, "learning_rate": 9.722045045151784e-05} +{"ts": "2025-12-27T06:55:29", "event": "train_log", "step": 2536, "epoch": 1.070042194092827, "progress_pct": 17.83, "epoch_pct": 17.83, "eta": "58:51:59", "max_grad_norm": 0.8, "loss": 0.688493549823761, "grad_norm": 1.333084225654602, "learning_rate": 9.721254964883024e-05} +{"ts": "2025-12-27T06:55:48", "event": "train_log", "step": 2538, "epoch": 1.070886075949367, "progress_pct": 17.85, "epoch_pct": 17.85, "eta": "58:50:01", "max_grad_norm": 0.8, "loss": 0.6527412533760071, "grad_norm": 1.2435462474822998, "learning_rate": 9.720463795511419e-05} +{"ts": "2025-12-27T06:56:10", "event": "train_log", "step": 2540, "epoch": 1.0717299578059072, "progress_pct": 17.86, "epoch_pct": 17.86, "eta": "58:48:18", "max_grad_norm": 0.8, "loss": 0.6508163809776306, "grad_norm": 1.1521880626678467, "learning_rate": 9.719671537219472e-05} +{"ts": "2025-12-27T06:56:32", "event": "train_log", "step": 2542, "epoch": 1.0725738396624473, "progress_pct": 17.88, "epoch_pct": 17.88, "eta": "58:46:39", "max_grad_norm": 0.8, "loss": 0.6954023838043213, "grad_norm": 1.015013575553894, "learning_rate": 9.718878190189947e-05} +{"ts": "2025-12-27T06:56:52", "event": "train_log", "step": 2544, "epoch": 1.0734177215189873, "progress_pct": 17.89, "epoch_pct": 17.89, "eta": "58:44:48", "max_grad_norm": 0.8, "loss": 0.7201322913169861, "grad_norm": 1.1507678031921387, "learning_rate": 9.718083754605851e-05} +{"ts": "2025-12-27T06:57:12", "event": "train_log", "step": 2546, "epoch": 1.0742616033755275, "progress_pct": 17.9, "epoch_pct": 17.9, "eta": "58:42:56", "max_grad_norm": 0.8, "loss": 0.6688649654388428, "grad_norm": 1.0569016933441162, "learning_rate": 9.717288230650444e-05} +{"ts": "2025-12-27T06:57:32", "event": "train_log", "step": 2548, "epoch": 1.0751054852320676, "progress_pct": 17.92, "epoch_pct": 17.92, "eta": "58:41:04", "max_grad_norm": 0.8, "loss": 0.7077898979187012, "grad_norm": 1.2178492546081543, "learning_rate": 9.716491618507241e-05} +{"ts": "2025-12-27T06:57:51", "event": "train_log", "step": 2550, "epoch": 1.0759493670886076, "progress_pct": 17.93, "epoch_pct": 17.93, "eta": "58:39:10", "max_grad_norm": 0.8, "loss": 0.7312119603157043, "grad_norm": 1.3587230443954468, "learning_rate": 9.715693918360002e-05} +{"ts": "2025-12-27T06:58:11", "event": "train_log", "step": 2552, "epoch": 1.0767932489451477, "progress_pct": 17.95, "epoch_pct": 17.95, "eta": "58:37:22", "max_grad_norm": 0.8, "loss": 0.6910589337348938, "grad_norm": 1.1930122375488281, "learning_rate": 9.714895130392744e-05} +{"ts": "2025-12-27T06:58:32", "event": "train_log", "step": 2554, "epoch": 1.0776371308016879, "progress_pct": 17.96, "epoch_pct": 17.96, "eta": "58:35:33", "max_grad_norm": 0.8, "loss": 0.7942836284637451, "grad_norm": 1.2440707683563232, "learning_rate": 9.71409525478973e-05} +{"ts": "2025-12-27T06:58:51", "event": "train_log", "step": 2556, "epoch": 1.0784810126582278, "progress_pct": 17.97, "epoch_pct": 17.97, "eta": "58:33:43", "max_grad_norm": 0.8, "loss": 0.6652286052703857, "grad_norm": 1.3755065202713013, "learning_rate": 9.713294291735477e-05} +{"ts": "2025-12-27T06:59:11", "event": "train_log", "step": 2558, "epoch": 1.079324894514768, "progress_pct": 17.99, "epoch_pct": 17.99, "eta": "58:31:53", "max_grad_norm": 0.8, "loss": 0.6025735139846802, "grad_norm": 1.165448784828186, "learning_rate": 9.71249224141475e-05} +{"ts": "2025-12-27T06:59:31", "event": "train_log", "step": 2560, "epoch": 1.080168776371308, "progress_pct": 18.0, "epoch_pct": 18.0, "eta": "58:30:01", "max_grad_norm": 0.8, "loss": 0.7343734502792358, "grad_norm": 1.2981204986572266, "learning_rate": 9.711689104012569e-05} +{"ts": "2025-12-27T06:59:50", "event": "train_log", "step": 2562, "epoch": 1.081012658227848, "progress_pct": 18.02, "epoch_pct": 18.02, "eta": "58:28:08", "max_grad_norm": 0.8, "loss": 0.6903306841850281, "grad_norm": 1.2040622234344482, "learning_rate": 9.710884879714202e-05} +{"ts": "2025-12-27T07:00:10", "event": "train_log", "step": 2564, "epoch": 1.0818565400843883, "progress_pct": 18.03, "epoch_pct": 18.03, "eta": "58:26:16", "max_grad_norm": 0.8, "loss": 0.69134920835495, "grad_norm": 1.1835904121398926, "learning_rate": 9.710079568705168e-05} +{"ts": "2025-12-27T07:00:29", "event": "train_log", "step": 2566, "epoch": 1.0827004219409282, "progress_pct": 18.05, "epoch_pct": 18.05, "eta": "58:24:23", "max_grad_norm": 0.8, "loss": 0.6471185088157654, "grad_norm": 1.3345229625701904, "learning_rate": 9.709273171171235e-05} +{"ts": "2025-12-27T07:00:50", "event": "train_log", "step": 2568, "epoch": 1.0835443037974684, "progress_pct": 18.06, "epoch_pct": 18.06, "eta": "58:22:40", "max_grad_norm": 0.8, "loss": 0.6302382349967957, "grad_norm": 1.0884469747543335, "learning_rate": 9.708465687298425e-05} +{"ts": "2025-12-27T07:01:09", "event": "train_log", "step": 2570, "epoch": 1.0843881856540085, "progress_pct": 18.07, "epoch_pct": 18.07, "eta": "58:20:45", "max_grad_norm": 0.8, "loss": 0.7329678535461426, "grad_norm": 1.1994211673736572, "learning_rate": 9.707657117273007e-05} +{"ts": "2025-12-27T07:01:27", "event": "train_log", "step": 2572, "epoch": 1.0852320675105485, "progress_pct": 18.09, "epoch_pct": 18.09, "eta": "58:18:49", "max_grad_norm": 0.8, "loss": 0.719862163066864, "grad_norm": 1.2609503269195557, "learning_rate": 9.706847461281507e-05} +{"ts": "2025-12-27T07:01:46", "event": "train_log", "step": 2574, "epoch": 1.0860759493670886, "progress_pct": 18.1, "epoch_pct": 18.1, "eta": "58:16:53", "max_grad_norm": 0.8, "loss": 0.7142901420593262, "grad_norm": 1.2686879634857178, "learning_rate": 9.706036719510694e-05} +{"ts": "2025-12-27T07:02:05", "event": "train_log", "step": 2576, "epoch": 1.0869198312236288, "progress_pct": 18.12, "epoch_pct": 18.12, "eta": "58:15:04", "max_grad_norm": 0.8, "loss": 0.7009075284004211, "grad_norm": 1.2763310670852661, "learning_rate": 9.705224892147591e-05} +{"ts": "2025-12-27T07:02:24", "event": "train_log", "step": 2578, "epoch": 1.0877637130801687, "progress_pct": 18.13, "epoch_pct": 18.13, "eta": "58:13:09", "max_grad_norm": 0.8, "loss": 0.6873779296875, "grad_norm": 1.1704022884368896, "learning_rate": 9.70441197937947e-05} +{"ts": "2025-12-27T07:02:45", "event": "train_log", "step": 2580, "epoch": 1.0886075949367089, "progress_pct": 18.14, "epoch_pct": 18.14, "eta": "58:11:25", "max_grad_norm": 0.8, "loss": 0.6437726020812988, "grad_norm": 1.0482875108718872, "learning_rate": 9.703597981393856e-05} +{"ts": "2025-12-27T07:03:03", "event": "train_log", "step": 2582, "epoch": 1.0894514767932488, "progress_pct": 18.16, "epoch_pct": 18.16, "eta": "58:09:29", "max_grad_norm": 0.8, "loss": 0.6933431625366211, "grad_norm": 1.28431236743927, "learning_rate": 9.702782898378521e-05} +{"ts": "2025-12-27T07:03:23", "event": "train_log", "step": 2584, "epoch": 1.090295358649789, "progress_pct": 18.17, "epoch_pct": 18.17, "eta": "58:07:40", "max_grad_norm": 0.8, "loss": 0.6488757133483887, "grad_norm": 1.0962283611297607, "learning_rate": 9.701966730521491e-05} +{"ts": "2025-12-27T07:03:41", "event": "train_log", "step": 2586, "epoch": 1.0911392405063292, "progress_pct": 18.19, "epoch_pct": 18.19, "eta": "58:05:44", "max_grad_norm": 0.8, "loss": 0.6385396122932434, "grad_norm": 1.2177873849868774, "learning_rate": 9.70114947801104e-05} +{"ts": "2025-12-27T07:04:00", "event": "train_log", "step": 2588, "epoch": 1.091983122362869, "progress_pct": 18.2, "epoch_pct": 18.2, "eta": "58:03:53", "max_grad_norm": 0.8, "loss": 0.6826614737510681, "grad_norm": 1.197059988975525, "learning_rate": 9.70033114103569e-05} +{"ts": "2025-12-27T07:04:20", "event": "train_log", "step": 2590, "epoch": 1.0928270042194093, "progress_pct": 18.21, "epoch_pct": 18.21, "eta": "58:02:06", "max_grad_norm": 0.8, "loss": 0.605629563331604, "grad_norm": 1.1624075174331665, "learning_rate": 9.699511719784217e-05} +{"ts": "2025-12-27T07:04:40", "event": "train_log", "step": 2592, "epoch": 1.0936708860759494, "progress_pct": 18.23, "epoch_pct": 18.23, "eta": "58:00:15", "max_grad_norm": 0.8, "loss": 0.734926700592041, "grad_norm": 1.2975167036056519, "learning_rate": 9.698691214445648e-05} +{"ts": "2025-12-27T07:04:58", "event": "train_log", "step": 2594, "epoch": 1.0945147679324894, "progress_pct": 18.24, "epoch_pct": 18.24, "eta": "57:58:22", "max_grad_norm": 0.8, "loss": 0.7281333804130554, "grad_norm": 1.215414047241211, "learning_rate": 9.697869625209255e-05} +{"ts": "2025-12-27T07:05:17", "event": "train_log", "step": 2596, "epoch": 1.0953586497890295, "progress_pct": 18.26, "epoch_pct": 18.26, "eta": "57:56:31", "max_grad_norm": 0.8, "loss": 0.7388250827789307, "grad_norm": 1.1862860918045044, "learning_rate": 9.697046952264563e-05} +{"ts": "2025-12-27T07:05:36", "event": "train_log", "step": 2598, "epoch": 1.0962025316455697, "progress_pct": 18.27, "epoch_pct": 18.27, "eta": "57:54:37", "max_grad_norm": 0.8, "loss": 0.6495320796966553, "grad_norm": 1.1127797365188599, "learning_rate": 9.696223195801348e-05} +{"ts": "2025-12-27T07:05:56", "event": "train_log", "step": 2600, "epoch": 1.0970464135021096, "progress_pct": 18.28, "epoch_pct": 18.28, "eta": "57:52:50", "max_grad_norm": 0.8, "loss": 0.7157143950462341, "grad_norm": 1.0863338708877563, "learning_rate": 9.695398356009636e-05} +{"ts": "2025-12-27T07:20:16", "event": "train_log", "step": 2600, "epoch": 1.0970464135021096, "progress_pct": 18.28, "epoch_pct": 18.28, "eta": "58:56:52", "max_grad_norm": 0.8, "eval_loss": 0.7377332448959351, "eval_runtime": 859.6612, "eval_samples_per_second": 2.451, "eval_steps_per_second": 2.451} +{"ts": "2025-12-27T07:20:36", "event": "train_log", "step": 2602, "epoch": 1.0978902953586498, "progress_pct": 18.3, "epoch_pct": 18.3, "eta": "58:55:04", "max_grad_norm": 0.8, "loss": 0.6597335934638977, "grad_norm": 1.1228652000427246, "learning_rate": 9.694572433079699e-05} +{"ts": "2025-12-27T07:20:56", "event": "train_log", "step": 2604, "epoch": 1.09873417721519, "progress_pct": 18.31, "epoch_pct": 18.31, "eta": "58:53:12", "max_grad_norm": 0.8, "loss": 0.6715680360794067, "grad_norm": 1.3077653646469116, "learning_rate": 9.69374542720206e-05} +{"ts": "2025-12-27T07:21:16", "event": "train_log", "step": 2606, "epoch": 1.09957805907173, "progress_pct": 18.33, "epoch_pct": 18.33, "eta": "58:51:25", "max_grad_norm": 0.8, "loss": 0.6910243034362793, "grad_norm": 1.241603970527649, "learning_rate": 9.692917338567499e-05} +{"ts": "2025-12-27T07:21:37", "event": "train_log", "step": 2608, "epoch": 1.10042194092827, "progress_pct": 18.34, "epoch_pct": 18.34, "eta": "58:49:36", "max_grad_norm": 0.8, "loss": 0.6519553065299988, "grad_norm": 1.1372551918029785, "learning_rate": 9.692088167367037e-05} +{"ts": "2025-12-27T07:21:56", "event": "train_log", "step": 2610, "epoch": 1.1012658227848102, "progress_pct": 18.35, "epoch_pct": 18.35, "eta": "58:47:42", "max_grad_norm": 0.8, "loss": 0.6542758941650391, "grad_norm": 1.2894765138626099, "learning_rate": 9.691257913791949e-05} +{"ts": "2025-12-27T07:22:17", "event": "train_log", "step": 2612, "epoch": 1.1021097046413502, "progress_pct": 18.37, "epoch_pct": 18.37, "eta": "58:46:01", "max_grad_norm": 0.8, "loss": 0.6886795163154602, "grad_norm": 1.0800915956497192, "learning_rate": 9.690426578033755e-05} +{"ts": "2025-12-27T07:22:37", "event": "train_log", "step": 2614, "epoch": 1.1029535864978903, "progress_pct": 18.38, "epoch_pct": 18.38, "eta": "58:44:08", "max_grad_norm": 0.8, "loss": 0.7512150406837463, "grad_norm": 1.3394384384155273, "learning_rate": 9.689594160284233e-05} +{"ts": "2025-12-27T07:22:57", "event": "train_log", "step": 2616, "epoch": 1.1037974683544305, "progress_pct": 18.4, "epoch_pct": 18.4, "eta": "58:42:18", "max_grad_norm": 0.8, "loss": 0.67207932472229, "grad_norm": 1.2175323963165283, "learning_rate": 9.688760660735402e-05} +{"ts": "2025-12-27T07:23:18", "event": "train_log", "step": 2618, "epoch": 1.1046413502109704, "progress_pct": 18.41, "epoch_pct": 18.41, "eta": "58:40:34", "max_grad_norm": 0.8, "loss": 0.6591740846633911, "grad_norm": 1.2181185483932495, "learning_rate": 9.687926079579537e-05} +{"ts": "2025-12-27T07:23:38", "event": "train_log", "step": 2620, "epoch": 1.1054852320675106, "progress_pct": 18.42, "epoch_pct": 18.42, "eta": "58:38:45", "max_grad_norm": 0.8, "loss": 0.6431041359901428, "grad_norm": 1.1740983724594116, "learning_rate": 9.68709041700916e-05} +{"ts": "2025-12-27T07:23:57", "event": "train_log", "step": 2622, "epoch": 1.1063291139240505, "progress_pct": 18.44, "epoch_pct": 18.44, "eta": "58:36:55", "max_grad_norm": 0.8, "loss": 0.6573615074157715, "grad_norm": 1.1792434453964233, "learning_rate": 9.686253673217038e-05} +{"ts": "2025-12-27T07:24:18", "event": "train_log", "step": 2624, "epoch": 1.1071729957805907, "progress_pct": 18.45, "epoch_pct": 18.45, "eta": "58:35:10", "max_grad_norm": 0.8, "loss": 0.5576209425926208, "grad_norm": 1.058391809463501, "learning_rate": 9.685415848396196e-05} +{"ts": "2025-12-27T07:24:37", "event": "train_log", "step": 2626, "epoch": 1.1080168776371309, "progress_pct": 18.47, "epoch_pct": 18.47, "eta": "58:33:17", "max_grad_norm": 0.8, "loss": 0.668684184551239, "grad_norm": 1.3203206062316895, "learning_rate": 9.684576942739903e-05} +{"ts": "2025-12-27T07:24:57", "event": "train_log", "step": 2628, "epoch": 1.1088607594936708, "progress_pct": 18.48, "epoch_pct": 18.48, "eta": "58:31:26", "max_grad_norm": 0.8, "loss": 0.6800089478492737, "grad_norm": 1.2391762733459473, "learning_rate": 9.68373695644168e-05} +{"ts": "2025-12-27T07:25:16", "event": "train_log", "step": 2630, "epoch": 1.109704641350211, "progress_pct": 18.5, "epoch_pct": 18.5, "eta": "58:29:32", "max_grad_norm": 0.8, "loss": 0.6433757543563843, "grad_norm": 1.2323405742645264, "learning_rate": 9.682895889695292e-05} +{"ts": "2025-12-27T07:25:36", "event": "train_log", "step": 2632, "epoch": 1.1105485232067511, "progress_pct": 18.51, "epoch_pct": 18.51, "eta": "58:27:44", "max_grad_norm": 0.8, "loss": 0.6628785729408264, "grad_norm": 1.2656551599502563, "learning_rate": 9.682053742694759e-05} +{"ts": "2025-12-27T07:25:54", "event": "train_log", "step": 2634, "epoch": 1.111392405063291, "progress_pct": 18.52, "epoch_pct": 18.52, "eta": "58:25:50", "max_grad_norm": 0.8, "loss": 0.6838971972465515, "grad_norm": 1.2984392642974854, "learning_rate": 9.681210515634349e-05} +{"ts": "2025-12-27T07:26:12", "event": "train_log", "step": 2636, "epoch": 1.1122362869198312, "progress_pct": 18.54, "epoch_pct": 18.54, "eta": "58:23:53", "max_grad_norm": 0.8, "loss": 0.7548647522926331, "grad_norm": 1.3200393915176392, "learning_rate": 9.680366208708576e-05} +{"ts": "2025-12-27T07:26:31", "event": "train_log", "step": 2638, "epoch": 1.1130801687763714, "progress_pct": 18.55, "epoch_pct": 18.55, "eta": "58:22:01", "max_grad_norm": 0.8, "loss": 0.6553335189819336, "grad_norm": 1.225388526916504, "learning_rate": 9.679520822112208e-05} +{"ts": "2025-12-27T07:26:51", "event": "train_log", "step": 2640, "epoch": 1.1139240506329113, "progress_pct": 18.57, "epoch_pct": 18.57, "eta": "58:20:10", "max_grad_norm": 0.8, "loss": 0.631401538848877, "grad_norm": 1.2350653409957886, "learning_rate": 9.678674356040259e-05} +{"ts": "2025-12-27T07:27:11", "event": "train_log", "step": 2642, "epoch": 1.1147679324894515, "progress_pct": 18.58, "epoch_pct": 18.58, "eta": "58:18:27", "max_grad_norm": 0.8, "loss": 0.6459156274795532, "grad_norm": 1.2325507402420044, "learning_rate": 9.677826810687989e-05} +{"ts": "2025-12-27T07:27:32", "event": "train_log", "step": 2644, "epoch": 1.1156118143459917, "progress_pct": 18.59, "epoch_pct": 18.59, "eta": "58:16:40", "max_grad_norm": 0.8, "loss": 0.6425284743309021, "grad_norm": 1.0008996725082397, "learning_rate": 9.676978186250915e-05} +{"ts": "2025-12-27T07:27:50", "event": "train_log", "step": 2646, "epoch": 1.1164556962025316, "progress_pct": 18.61, "epoch_pct": 18.61, "eta": "58:14:46", "max_grad_norm": 0.8, "loss": 0.6451422572135925, "grad_norm": 1.3767247200012207, "learning_rate": 9.676128482924796e-05} +{"ts": "2025-12-27T07:28:09", "event": "train_log", "step": 2648, "epoch": 1.1172995780590718, "progress_pct": 18.62, "epoch_pct": 18.62, "eta": "58:12:56", "max_grad_norm": 0.8, "loss": 0.6713272929191589, "grad_norm": 1.2070895433425903, "learning_rate": 9.675277700905643e-05} +{"ts": "2025-12-27T07:28:29", "event": "train_log", "step": 2650, "epoch": 1.1181434599156117, "progress_pct": 18.64, "epoch_pct": 18.64, "eta": "58:11:07", "max_grad_norm": 0.8, "loss": 0.6285044550895691, "grad_norm": 1.1582069396972656, "learning_rate": 9.674425840389716e-05} +{"ts": "2025-12-27T07:28:51", "event": "train_log", "step": 2652, "epoch": 1.1189873417721519, "progress_pct": 18.65, "epoch_pct": 18.65, "eta": "58:09:27", "max_grad_norm": 0.8, "loss": 0.624229907989502, "grad_norm": 1.1641311645507812, "learning_rate": 9.67357290157352e-05} +{"ts": "2025-12-27T07:29:10", "event": "train_log", "step": 2654, "epoch": 1.119831223628692, "progress_pct": 18.66, "epoch_pct": 18.66, "eta": "58:07:38", "max_grad_norm": 0.8, "loss": 0.7214919328689575, "grad_norm": 1.3071147203445435, "learning_rate": 9.672718884653814e-05} +{"ts": "2025-12-27T07:29:29", "event": "train_log", "step": 2656, "epoch": 1.120675105485232, "progress_pct": 18.68, "epoch_pct": 18.68, "eta": "58:05:48", "max_grad_norm": 0.8, "loss": 0.8062215447425842, "grad_norm": 1.2157800197601318, "learning_rate": 9.671863789827602e-05} +{"ts": "2025-12-27T07:29:49", "event": "train_log", "step": 2658, "epoch": 1.1215189873417721, "progress_pct": 18.69, "epoch_pct": 18.69, "eta": "58:04:01", "max_grad_norm": 0.8, "loss": 0.6362426280975342, "grad_norm": 1.2843927145004272, "learning_rate": 9.671007617292138e-05} +{"ts": "2025-12-27T07:30:08", "event": "train_log", "step": 2660, "epoch": 1.1223628691983123, "progress_pct": 18.71, "epoch_pct": 18.71, "eta": "58:02:11", "max_grad_norm": 0.8, "loss": 0.6181318163871765, "grad_norm": 1.1182712316513062, "learning_rate": 9.670150367244927e-05} +{"ts": "2025-12-27T07:30:27", "event": "train_log", "step": 2662, "epoch": 1.1232067510548522, "progress_pct": 18.72, "epoch_pct": 18.72, "eta": "58:00:18", "max_grad_norm": 0.8, "loss": 0.6973897218704224, "grad_norm": 1.566605806350708, "learning_rate": 9.669292039883717e-05} +{"ts": "2025-12-27T07:30:47", "event": "train_log", "step": 2664, "epoch": 1.1240506329113924, "progress_pct": 18.73, "epoch_pct": 18.73, "eta": "57:58:31", "max_grad_norm": 0.8, "loss": 0.6117324829101562, "grad_norm": 1.0726850032806396, "learning_rate": 9.66843263540651e-05} +{"ts": "2025-12-27T07:31:06", "event": "train_log", "step": 2666, "epoch": 1.1248945147679326, "progress_pct": 18.75, "epoch_pct": 18.75, "eta": "57:56:43", "max_grad_norm": 0.8, "loss": 0.642676830291748, "grad_norm": 1.2953020334243774, "learning_rate": 9.66757215401155e-05} +{"ts": "2025-12-27T07:31:25", "event": "train_log", "step": 2668, "epoch": 1.1257383966244725, "progress_pct": 18.76, "epoch_pct": 18.76, "eta": "57:54:53", "max_grad_norm": 0.8, "loss": 0.6757452487945557, "grad_norm": 1.1184383630752563, "learning_rate": 9.66671059589734e-05} +{"ts": "2025-12-27T07:31:44", "event": "train_log", "step": 2670, "epoch": 1.1265822784810127, "progress_pct": 18.78, "epoch_pct": 18.78, "eta": "57:53:04", "max_grad_norm": 0.8, "loss": 0.6861951947212219, "grad_norm": 1.2732970714569092, "learning_rate": 9.66584796126262e-05} +{"ts": "2025-12-27T07:32:04", "event": "train_log", "step": 2672, "epoch": 1.1274261603375528, "progress_pct": 18.79, "epoch_pct": 18.79, "eta": "57:51:16", "max_grad_norm": 0.8, "loss": 0.6727077960968018, "grad_norm": 1.2713000774383545, "learning_rate": 9.664984250306383e-05} +{"ts": "2025-12-27T07:32:23", "event": "train_log", "step": 2674, "epoch": 1.1282700421940928, "progress_pct": 18.8, "epoch_pct": 18.8, "eta": "57:49:27", "max_grad_norm": 0.8, "loss": 0.7355974912643433, "grad_norm": 1.269827961921692, "learning_rate": 9.664119463227874e-05} +{"ts": "2025-12-27T07:32:42", "event": "train_log", "step": 2676, "epoch": 1.129113924050633, "progress_pct": 18.82, "epoch_pct": 18.82, "eta": "57:47:38", "max_grad_norm": 0.8, "loss": 0.7121313214302063, "grad_norm": 1.3067172765731812, "learning_rate": 9.663253600226581e-05} +{"ts": "2025-12-27T07:33:02", "event": "train_log", "step": 2678, "epoch": 1.129957805907173, "progress_pct": 18.83, "epoch_pct": 18.83, "eta": "57:45:53", "max_grad_norm": 0.8, "loss": 0.6671369075775146, "grad_norm": 1.2958797216415405, "learning_rate": 9.662386661502242e-05} +{"ts": "2025-12-27T07:33:21", "event": "train_log", "step": 2680, "epoch": 1.130801687763713, "progress_pct": 18.85, "epoch_pct": 18.85, "eta": "57:44:00", "max_grad_norm": 0.8, "loss": 0.6153768301010132, "grad_norm": 1.2943401336669922, "learning_rate": 9.661518647254842e-05} +{"ts": "2025-12-27T07:33:40", "event": "train_log", "step": 2682, "epoch": 1.1316455696202532, "progress_pct": 18.86, "epoch_pct": 18.86, "eta": "57:42:15", "max_grad_norm": 0.8, "loss": 0.6070778965950012, "grad_norm": 1.1744167804718018, "learning_rate": 9.660649557684616e-05} +{"ts": "2025-12-27T07:34:00", "event": "train_log", "step": 2684, "epoch": 1.1324894514767934, "progress_pct": 18.87, "epoch_pct": 18.87, "eta": "57:40:27", "max_grad_norm": 0.8, "loss": 0.676887035369873, "grad_norm": 1.159209132194519, "learning_rate": 9.659779392992047e-05} +{"ts": "2025-12-27T07:34:20", "event": "train_log", "step": 2686, "epoch": 1.1333333333333333, "progress_pct": 18.89, "epoch_pct": 18.89, "eta": "57:38:42", "max_grad_norm": 0.8, "loss": 0.6086745262145996, "grad_norm": 1.1937510967254639, "learning_rate": 9.658908153377866e-05} +{"ts": "2025-12-27T07:34:40", "event": "train_log", "step": 2688, "epoch": 1.1341772151898735, "progress_pct": 18.9, "epoch_pct": 18.9, "eta": "57:36:59", "max_grad_norm": 0.8, "loss": 0.6493708491325378, "grad_norm": 1.1461687088012695, "learning_rate": 9.658035839043049e-05} +{"ts": "2025-12-27T07:35:00", "event": "train_log", "step": 2690, "epoch": 1.1350210970464134, "progress_pct": 18.92, "epoch_pct": 18.92, "eta": "57:35:16", "max_grad_norm": 0.8, "loss": 0.6813004016876221, "grad_norm": 2.066361665725708, "learning_rate": 9.657162450188824e-05} +{"ts": "2025-12-27T07:35:21", "event": "train_log", "step": 2692, "epoch": 1.1358649789029536, "progress_pct": 18.93, "epoch_pct": 18.93, "eta": "57:33:35", "max_grad_norm": 0.8, "loss": 0.721062183380127, "grad_norm": 1.086910367012024, "learning_rate": 9.656287987016664e-05} +{"ts": "2025-12-27T07:35:42", "event": "train_log", "step": 2694, "epoch": 1.1367088607594937, "progress_pct": 18.95, "epoch_pct": 18.95, "eta": "57:31:56", "max_grad_norm": 0.8, "loss": 0.5975021123886108, "grad_norm": 1.1869292259216309, "learning_rate": 9.65541244972829e-05} +{"ts": "2025-12-27T07:36:01", "event": "train_log", "step": 2696, "epoch": 1.1375527426160337, "progress_pct": 18.96, "epoch_pct": 18.96, "eta": "57:30:07", "max_grad_norm": 0.8, "loss": 0.6818324327468872, "grad_norm": 1.2456518411636353, "learning_rate": 9.654535838525674e-05} +{"ts": "2025-12-27T07:36:21", "event": "train_log", "step": 2698, "epoch": 1.1383966244725738, "progress_pct": 18.97, "epoch_pct": 18.97, "eta": "57:28:20", "max_grad_norm": 0.8, "loss": 0.6844469308853149, "grad_norm": 1.5271464586257935, "learning_rate": 9.653658153611031e-05} +{"ts": "2025-12-27T07:36:41", "event": "train_log", "step": 2700, "epoch": 1.139240506329114, "progress_pct": 18.99, "epoch_pct": 18.99, "eta": "57:26:39", "max_grad_norm": 0.8, "loss": 0.6388684511184692, "grad_norm": 1.1403794288635254, "learning_rate": 9.652779395186827e-05} +{"ts": "2025-12-27T07:51:03", "event": "train_log", "step": 2700, "epoch": 1.139240506329114, "progress_pct": 18.99, "epoch_pct": 18.99, "eta": "58:27:56", "max_grad_norm": 0.8, "eval_loss": 0.7335711717605591, "eval_runtime": 861.9651, "eval_samples_per_second": 2.444, "eval_steps_per_second": 2.444} +{"ts": "2025-12-27T07:51:23", "event": "train_log", "step": 2702, "epoch": 1.140084388185654, "progress_pct": 19.0, "epoch_pct": 19.0, "eta": "58:26:08", "max_grad_norm": 0.8, "loss": 0.6154619455337524, "grad_norm": 1.1091634035110474, "learning_rate": 9.651899563455775e-05} +{"ts": "2025-12-27T07:51:43", "event": "train_log", "step": 2704, "epoch": 1.140928270042194, "progress_pct": 19.02, "epoch_pct": 19.02, "eta": "58:24:21", "max_grad_norm": 0.8, "loss": 0.629319429397583, "grad_norm": 1.3280601501464844, "learning_rate": 9.651018658620837e-05} +{"ts": "2025-12-27T07:52:02", "event": "train_log", "step": 2706, "epoch": 1.1417721518987343, "progress_pct": 19.03, "epoch_pct": 19.03, "eta": "58:22:33", "max_grad_norm": 0.8, "loss": 0.6088175773620605, "grad_norm": 1.226806402206421, "learning_rate": 9.650136680885216e-05} +{"ts": "2025-12-27T07:52:22", "event": "train_log", "step": 2708, "epoch": 1.1426160337552742, "progress_pct": 19.04, "epoch_pct": 19.04, "eta": "58:20:45", "max_grad_norm": 0.8, "loss": 0.6199659705162048, "grad_norm": 1.0593408346176147, "learning_rate": 9.649253630452372e-05} +{"ts": "2025-12-27T07:52:41", "event": "train_log", "step": 2710, "epoch": 1.1434599156118144, "progress_pct": 19.06, "epoch_pct": 19.06, "eta": "58:18:55", "max_grad_norm": 0.8, "loss": 0.7233364582061768, "grad_norm": 1.1112475395202637, "learning_rate": 9.648369507526008e-05} +{"ts": "2025-12-27T07:53:01", "event": "train_log", "step": 2712, "epoch": 1.1443037974683543, "progress_pct": 19.07, "epoch_pct": 19.07, "eta": "58:17:08", "max_grad_norm": 0.8, "loss": 0.6687955856323242, "grad_norm": 1.1737885475158691, "learning_rate": 9.647484312310068e-05} +{"ts": "2025-12-27T07:53:20", "event": "train_log", "step": 2714, "epoch": 1.1451476793248945, "progress_pct": 19.09, "epoch_pct": 19.09, "eta": "58:15:19", "max_grad_norm": 0.8, "loss": 0.6508969068527222, "grad_norm": 1.194532036781311, "learning_rate": 9.646598045008756e-05} +{"ts": "2025-12-27T07:53:40", "event": "train_log", "step": 2716, "epoch": 1.1459915611814346, "progress_pct": 19.1, "epoch_pct": 19.1, "eta": "58:13:30", "max_grad_norm": 0.8, "loss": 0.6408317685127258, "grad_norm": 1.069395899772644, "learning_rate": 9.645710705826517e-05} +{"ts": "2025-12-27T07:53:59", "event": "train_log", "step": 2718, "epoch": 1.1468354430379746, "progress_pct": 19.11, "epoch_pct": 19.11, "eta": "58:11:41", "max_grad_norm": 0.8, "loss": 0.650763750076294, "grad_norm": 1.2429133653640747, "learning_rate": 9.644822294968037e-05} +{"ts": "2025-12-27T07:54:19", "event": "train_log", "step": 2720, "epoch": 1.1476793248945147, "progress_pct": 19.13, "epoch_pct": 19.13, "eta": "58:09:53", "max_grad_norm": 0.8, "loss": 0.6952191591262817, "grad_norm": 1.2950133085250854, "learning_rate": 9.64393281263826e-05} +{"ts": "2025-12-27T07:54:39", "event": "train_log", "step": 2722, "epoch": 1.148523206751055, "progress_pct": 19.14, "epoch_pct": 19.14, "eta": "58:08:10", "max_grad_norm": 0.8, "loss": 0.6772956252098083, "grad_norm": 1.1972628831863403, "learning_rate": 9.643042259042372e-05} +{"ts": "2025-12-27T07:54:58", "event": "train_log", "step": 2724, "epoch": 1.1493670886075948, "progress_pct": 19.16, "epoch_pct": 19.16, "eta": "58:06:19", "max_grad_norm": 0.8, "loss": 0.6734447479248047, "grad_norm": 1.1670407056808472, "learning_rate": 9.642150634385805e-05} +{"ts": "2025-12-27T07:55:18", "event": "train_log", "step": 2726, "epoch": 1.150210970464135, "progress_pct": 19.17, "epoch_pct": 19.17, "eta": "58:04:36", "max_grad_norm": 0.8, "loss": 0.6387717127799988, "grad_norm": 1.120302677154541, "learning_rate": 9.641257938874243e-05} +{"ts": "2025-12-27T07:55:38", "event": "train_log", "step": 2728, "epoch": 1.1510548523206752, "progress_pct": 19.18, "epoch_pct": 19.18, "eta": "58:02:48", "max_grad_norm": 0.8, "loss": 0.6592874526977539, "grad_norm": 1.1241344213485718, "learning_rate": 9.640364172713609e-05} +{"ts": "2025-12-27T07:55:57", "event": "train_log", "step": 2730, "epoch": 1.1518987341772151, "progress_pct": 19.2, "epoch_pct": 19.2, "eta": "58:01:00", "max_grad_norm": 0.8, "loss": 0.7257466912269592, "grad_norm": 1.2627261877059937, "learning_rate": 9.639469336110083e-05} +{"ts": "2025-12-27T07:56:17", "event": "train_log", "step": 2732, "epoch": 1.1527426160337553, "progress_pct": 19.21, "epoch_pct": 19.21, "eta": "57:59:15", "max_grad_norm": 0.8, "loss": 0.572188138961792, "grad_norm": 1.0528618097305298, "learning_rate": 9.638573429270083e-05} +{"ts": "2025-12-27T07:56:35", "event": "train_log", "step": 2734, "epoch": 1.1535864978902954, "progress_pct": 19.23, "epoch_pct": 19.23, "eta": "57:57:21", "max_grad_norm": 0.8, "loss": 0.678981602191925, "grad_norm": 1.212536334991455, "learning_rate": 9.637676452400277e-05} +{"ts": "2025-12-27T07:56:54", "event": "train_log", "step": 2736, "epoch": 1.1544303797468354, "progress_pct": 19.24, "epoch_pct": 19.24, "eta": "57:55:30", "max_grad_norm": 0.8, "loss": 0.6375001072883606, "grad_norm": 1.152167797088623, "learning_rate": 9.636778405707582e-05} +{"ts": "2025-12-27T07:57:13", "event": "train_log", "step": 2738, "epoch": 1.1552742616033755, "progress_pct": 19.25, "epoch_pct": 19.25, "eta": "57:53:42", "max_grad_norm": 0.8, "loss": 0.7602289319038391, "grad_norm": 1.2400429248809814, "learning_rate": 9.635879289399161e-05} +{"ts": "2025-12-27T07:57:33", "event": "train_log", "step": 2740, "epoch": 1.1561181434599157, "progress_pct": 19.27, "epoch_pct": 19.27, "eta": "57:51:58", "max_grad_norm": 0.8, "loss": 0.6209543943405151, "grad_norm": 1.3488622903823853, "learning_rate": 9.634979103682421e-05} +{"ts": "2025-12-27T07:57:53", "event": "train_log", "step": 2742, "epoch": 1.1569620253164556, "progress_pct": 19.28, "epoch_pct": 19.28, "eta": "57:50:13", "max_grad_norm": 0.8, "loss": 0.6215830445289612, "grad_norm": 1.1999555826187134, "learning_rate": 9.634077848765019e-05} +{"ts": "2025-12-27T07:58:11", "event": "train_log", "step": 2744, "epoch": 1.1578059071729958, "progress_pct": 19.3, "epoch_pct": 19.3, "eta": "57:48:22", "max_grad_norm": 0.8, "loss": 0.6634654998779297, "grad_norm": 1.2008578777313232, "learning_rate": 9.633175524854855e-05} +{"ts": "2025-12-27T07:58:30", "event": "train_log", "step": 2746, "epoch": 1.158649789029536, "progress_pct": 19.31, "epoch_pct": 19.31, "eta": "57:46:31", "max_grad_norm": 0.8, "loss": 0.7515161633491516, "grad_norm": 1.3920676708221436, "learning_rate": 9.63227213216008e-05} +{"ts": "2025-12-27T07:58:48", "event": "train_log", "step": 2748, "epoch": 1.159493670886076, "progress_pct": 19.32, "epoch_pct": 19.32, "eta": "57:44:39", "max_grad_norm": 0.8, "loss": 0.724361777305603, "grad_norm": 1.0551656484603882, "learning_rate": 9.631367670889089e-05} +{"ts": "2025-12-27T07:59:07", "event": "train_log", "step": 2750, "epoch": 1.160337552742616, "progress_pct": 19.34, "epoch_pct": 19.34, "eta": "57:42:50", "max_grad_norm": 0.8, "loss": 0.6673553586006165, "grad_norm": 1.2820028066635132, "learning_rate": 9.630462141250523e-05} +{"ts": "2025-12-27T07:59:26", "event": "train_log", "step": 2752, "epoch": 1.1611814345991562, "progress_pct": 19.35, "epoch_pct": 19.35, "eta": "57:41:02", "max_grad_norm": 0.8, "loss": 0.7029784917831421, "grad_norm": 1.1452983617782593, "learning_rate": 9.62955554345327e-05} +{"ts": "2025-12-27T07:59:46", "event": "train_log", "step": 2754, "epoch": 1.1620253164556962, "progress_pct": 19.37, "epoch_pct": 19.37, "eta": "57:39:19", "max_grad_norm": 0.8, "loss": 0.7355457544326782, "grad_norm": 1.1808624267578125, "learning_rate": 9.628647877706466e-05} +{"ts": "2025-12-27T08:00:05", "event": "train_log", "step": 2756, "epoch": 1.1628691983122363, "progress_pct": 19.38, "epoch_pct": 19.38, "eta": "57:37:33", "max_grad_norm": 0.8, "loss": 0.6144933700561523, "grad_norm": 1.0574703216552734, "learning_rate": 9.627739144219492e-05} +{"ts": "2025-12-27T08:00:25", "event": "train_log", "step": 2758, "epoch": 1.1637130801687763, "progress_pct": 19.4, "epoch_pct": 19.4, "eta": "57:35:49", "max_grad_norm": 0.8, "loss": 0.6843759417533875, "grad_norm": 1.215733528137207, "learning_rate": 9.626829343201974e-05} +{"ts": "2025-12-27T08:00:46", "event": "train_log", "step": 2760, "epoch": 1.1645569620253164, "progress_pct": 19.41, "epoch_pct": 19.41, "eta": "57:34:09", "max_grad_norm": 0.8, "loss": 0.6197049617767334, "grad_norm": 1.1667706966400146, "learning_rate": 9.625918474863787e-05} +{"ts": "2025-12-27T08:01:06", "event": "train_log", "step": 2762, "epoch": 1.1654008438818566, "progress_pct": 19.42, "epoch_pct": 19.42, "eta": "57:32:25", "max_grad_norm": 0.8, "loss": 0.715958297252655, "grad_norm": 1.3765631914138794, "learning_rate": 9.62500653941505e-05} +{"ts": "2025-12-27T08:01:24", "event": "train_log", "step": 2764, "epoch": 1.1662447257383965, "progress_pct": 19.44, "epoch_pct": 19.44, "eta": "57:30:35", "max_grad_norm": 0.8, "loss": 0.7433139085769653, "grad_norm": 1.173715591430664, "learning_rate": 9.62409353706613e-05} +{"ts": "2025-12-27T08:01:44", "event": "train_log", "step": 2766, "epoch": 1.1670886075949367, "progress_pct": 19.45, "epoch_pct": 19.45, "eta": "57:28:52", "max_grad_norm": 0.8, "loss": 0.7174371480941772, "grad_norm": 1.1837430000305176, "learning_rate": 9.623179468027637e-05} +{"ts": "2025-12-27T08:02:05", "event": "train_log", "step": 2768, "epoch": 1.1679324894514769, "progress_pct": 19.47, "epoch_pct": 19.47, "eta": "57:27:13", "max_grad_norm": 0.8, "loss": 0.7184823751449585, "grad_norm": 1.1577154397964478, "learning_rate": 9.622264332510432e-05} +{"ts": "2025-12-27T08:02:23", "event": "train_log", "step": 2770, "epoch": 1.1687763713080168, "progress_pct": 19.48, "epoch_pct": 19.48, "eta": "57:25:23", "max_grad_norm": 0.8, "loss": 0.693343460559845, "grad_norm": 1.165246605873108, "learning_rate": 9.621348130725617e-05} +{"ts": "2025-12-27T08:02:43", "event": "train_log", "step": 2772, "epoch": 1.169620253164557, "progress_pct": 19.49, "epoch_pct": 19.49, "eta": "57:23:38", "max_grad_norm": 0.8, "loss": 0.6999852061271667, "grad_norm": 1.2853080034255981, "learning_rate": 9.620430862884542e-05} +{"ts": "2025-12-27T08:03:03", "event": "train_log", "step": 2774, "epoch": 1.1704641350210971, "progress_pct": 19.51, "epoch_pct": 19.51, "eta": "57:21:56", "max_grad_norm": 0.8, "loss": 0.6034331321716309, "grad_norm": 1.1782865524291992, "learning_rate": 9.619512529198806e-05} +{"ts": "2025-12-27T08:03:21", "event": "train_log", "step": 2776, "epoch": 1.171308016877637, "progress_pct": 19.52, "epoch_pct": 19.52, "eta": "57:20:06", "max_grad_norm": 0.8, "loss": 0.7588269710540771, "grad_norm": 1.4055447578430176, "learning_rate": 9.61859312988025e-05} +{"ts": "2025-12-27T08:03:40", "event": "train_log", "step": 2778, "epoch": 1.1721518987341772, "progress_pct": 19.54, "epoch_pct": 19.54, "eta": "57:18:21", "max_grad_norm": 0.8, "loss": 0.6913981437683105, "grad_norm": 1.1148805618286133, "learning_rate": 9.617672665140957e-05} +{"ts": "2025-12-27T08:04:01", "event": "train_log", "step": 2780, "epoch": 1.1729957805907172, "progress_pct": 19.55, "epoch_pct": 19.55, "eta": "57:16:43", "max_grad_norm": 0.8, "loss": 0.5976925492286682, "grad_norm": 1.1311042308807373, "learning_rate": 9.616751135193266e-05} +{"ts": "2025-12-27T08:04:21", "event": "train_log", "step": 2782, "epoch": 1.1738396624472573, "progress_pct": 19.56, "epoch_pct": 19.56, "eta": "57:14:59", "max_grad_norm": 0.8, "loss": 0.6897050142288208, "grad_norm": 1.2378602027893066, "learning_rate": 9.615828540249754e-05} +{"ts": "2025-12-27T08:04:42", "event": "train_log", "step": 2784, "epoch": 1.1746835443037975, "progress_pct": 19.58, "epoch_pct": 19.58, "eta": "57:13:21", "max_grad_norm": 0.8, "loss": 0.6772098541259766, "grad_norm": 1.3445732593536377, "learning_rate": 9.614904880523248e-05} +{"ts": "2025-12-27T08:05:00", "event": "train_log", "step": 2786, "epoch": 1.1755274261603375, "progress_pct": 19.59, "epoch_pct": 19.59, "eta": "57:11:30", "max_grad_norm": 0.8, "loss": 0.6354818344116211, "grad_norm": 1.3380862474441528, "learning_rate": 9.613980156226815e-05} +{"ts": "2025-12-27T08:05:22", "event": "train_log", "step": 2788, "epoch": 1.1763713080168776, "progress_pct": 19.61, "epoch_pct": 19.61, "eta": "57:09:56", "max_grad_norm": 0.8, "loss": 0.6541208028793335, "grad_norm": 1.0955157279968262, "learning_rate": 9.613054367573773e-05} +{"ts": "2025-12-27T08:05:42", "event": "train_log", "step": 2790, "epoch": 1.1772151898734178, "progress_pct": 19.62, "epoch_pct": 19.62, "eta": "57:08:16", "max_grad_norm": 0.8, "loss": 0.6472887992858887, "grad_norm": 1.0176626443862915, "learning_rate": 9.612127514777686e-05} +{"ts": "2025-12-27T08:06:01", "event": "train_log", "step": 2792, "epoch": 1.1780590717299577, "progress_pct": 19.63, "epoch_pct": 19.63, "eta": "57:06:30", "max_grad_norm": 0.8, "loss": 0.7511212229728699, "grad_norm": 1.2644864320755005, "learning_rate": 9.611199598052357e-05} +{"ts": "2025-12-27T08:06:21", "event": "train_log", "step": 2794, "epoch": 1.1789029535864979, "progress_pct": 19.65, "epoch_pct": 19.65, "eta": "57:04:48", "max_grad_norm": 0.8, "loss": 0.696236789226532, "grad_norm": 1.248197317123413, "learning_rate": 9.61027061761184e-05} +{"ts": "2025-12-27T08:06:40", "event": "train_log", "step": 2796, "epoch": 1.179746835443038, "progress_pct": 19.66, "epoch_pct": 19.66, "eta": "57:03:06", "max_grad_norm": 0.8, "loss": 0.5962010622024536, "grad_norm": 1.189935564994812, "learning_rate": 9.609340573670436e-05} +{"ts": "2025-12-27T08:07:00", "event": "train_log", "step": 2798, "epoch": 1.180590717299578, "progress_pct": 19.68, "epoch_pct": 19.68, "eta": "57:01:25", "max_grad_norm": 0.8, "loss": 0.5981685519218445, "grad_norm": 1.1760492324829102, "learning_rate": 9.608409466442685e-05} +{"ts": "2025-12-27T08:07:20", "event": "train_log", "step": 2800, "epoch": 1.1814345991561181, "progress_pct": 19.69, "epoch_pct": 19.69, "eta": "56:59:42", "max_grad_norm": 0.8, "loss": 0.6186091303825378, "grad_norm": 1.1820716857910156, "learning_rate": 9.607477296143374e-05} +{"ts": "2025-12-27T08:21:29", "event": "train_log", "step": 2800, "epoch": 1.1814345991561181, "progress_pct": 19.69, "epoch_pct": 19.69, "eta": "57:57:27", "max_grad_norm": 0.8, "eval_loss": 0.7298192977905273, "eval_runtime": 849.544, "eval_samples_per_second": 2.48, "eval_steps_per_second": 2.48} +{"ts": "2025-12-27T08:21:50", "event": "train_log", "step": 2802, "epoch": 1.1822784810126583, "progress_pct": 19.7, "epoch_pct": 19.7, "eta": "57:55:45", "max_grad_norm": 0.8, "loss": 0.5859389901161194, "grad_norm": 1.0353888273239136, "learning_rate": 9.606544062987541e-05} +{"ts": "2025-12-27T08:22:08", "event": "train_log", "step": 2804, "epoch": 1.1831223628691983, "progress_pct": 19.72, "epoch_pct": 19.72, "eta": "57:53:53", "max_grad_norm": 0.8, "loss": 0.6573460698127747, "grad_norm": 1.3141933679580688, "learning_rate": 9.605609767190464e-05} +{"ts": "2025-12-27T08:22:27", "event": "train_log", "step": 2806, "epoch": 1.1839662447257384, "progress_pct": 19.73, "epoch_pct": 19.73, "eta": "57:52:07", "max_grad_norm": 0.8, "loss": 0.6991921067237854, "grad_norm": 1.1209372282028198, "learning_rate": 9.604674408967664e-05} +{"ts": "2025-12-27T08:22:48", "event": "train_log", "step": 2808, "epoch": 1.1848101265822786, "progress_pct": 19.75, "epoch_pct": 19.75, "eta": "57:50:26", "max_grad_norm": 0.8, "loss": 0.6438087821006775, "grad_norm": 1.2830493450164795, "learning_rate": 9.603737988534913e-05} +{"ts": "2025-12-27T08:23:07", "event": "train_log", "step": 2810, "epoch": 1.1856540084388185, "progress_pct": 19.76, "epoch_pct": 19.76, "eta": "57:48:38", "max_grad_norm": 0.8, "loss": 0.6452094316482544, "grad_norm": 1.1427195072174072, "learning_rate": 9.602800506108225e-05} +{"ts": "2025-12-27T08:23:25", "event": "train_log", "step": 2812, "epoch": 1.1864978902953587, "progress_pct": 19.77, "epoch_pct": 19.77, "eta": "57:46:47", "max_grad_norm": 0.8, "loss": 0.6745601296424866, "grad_norm": 1.316420078277588, "learning_rate": 9.601861961903857e-05} +{"ts": "2025-12-27T08:23:44", "event": "train_log", "step": 2814, "epoch": 1.1873417721518988, "progress_pct": 19.79, "epoch_pct": 19.79, "eta": "57:45:01", "max_grad_norm": 0.8, "loss": 0.6761514544487, "grad_norm": 1.1643308401107788, "learning_rate": 9.600922356138317e-05} +{"ts": "2025-12-27T08:24:04", "event": "train_log", "step": 2816, "epoch": 1.1881856540084388, "progress_pct": 19.8, "epoch_pct": 19.8, "eta": "57:43:15", "max_grad_norm": 0.8, "loss": 0.6453908681869507, "grad_norm": 1.036056399345398, "learning_rate": 9.59998168902835e-05} +{"ts": "2025-12-27T08:24:22", "event": "train_log", "step": 2818, "epoch": 1.189029535864979, "progress_pct": 19.82, "epoch_pct": 19.82, "eta": "57:41:27", "max_grad_norm": 0.8, "loss": 0.6576406359672546, "grad_norm": 1.2211129665374756, "learning_rate": 9.599039960790954e-05} +{"ts": "2025-12-27T08:24:42", "event": "train_log", "step": 2820, "epoch": 1.189873417721519, "progress_pct": 19.83, "epoch_pct": 19.83, "eta": "57:39:45", "max_grad_norm": 0.8, "loss": 0.6214181780815125, "grad_norm": 1.084114670753479, "learning_rate": 9.598097171643364e-05} +{"ts": "2025-12-27T08:25:02", "event": "train_log", "step": 2822, "epoch": 1.190717299578059, "progress_pct": 19.85, "epoch_pct": 19.85, "eta": "57:38:00", "max_grad_norm": 0.8, "loss": 0.6381646990776062, "grad_norm": 1.1297314167022705, "learning_rate": 9.597153321803064e-05} +{"ts": "2025-12-27T08:25:21", "event": "train_log", "step": 2824, "epoch": 1.1915611814345992, "progress_pct": 19.86, "epoch_pct": 19.86, "eta": "57:36:14", "max_grad_norm": 0.8, "loss": 0.7129076719284058, "grad_norm": 1.2568120956420898, "learning_rate": 9.596208411487784e-05} +{"ts": "2025-12-27T08:25:41", "event": "train_log", "step": 2826, "epoch": 1.1924050632911392, "progress_pct": 19.87, "epoch_pct": 19.87, "eta": "57:34:30", "max_grad_norm": 0.8, "loss": 0.7123546004295349, "grad_norm": 1.07041335105896, "learning_rate": 9.595262440915493e-05} +{"ts": "2025-12-27T08:26:00", "event": "train_log", "step": 2828, "epoch": 1.1932489451476793, "progress_pct": 19.89, "epoch_pct": 19.89, "eta": "57:32:45", "max_grad_norm": 0.8, "loss": 0.7263038158416748, "grad_norm": 1.3950074911117554, "learning_rate": 9.594315410304413e-05} +{"ts": "2025-12-27T08:26:20", "event": "train_log", "step": 2830, "epoch": 1.1940928270042195, "progress_pct": 19.9, "epoch_pct": 19.9, "eta": "57:31:03", "max_grad_norm": 0.8, "loss": 0.6863036751747131, "grad_norm": 1.2470672130584717, "learning_rate": 9.593367319873002e-05} +{"ts": "2025-12-27T08:26:42", "event": "train_log", "step": 2832, "epoch": 1.1949367088607594, "progress_pct": 19.92, "epoch_pct": 19.92, "eta": "57:29:29", "max_grad_norm": 0.8, "loss": 0.745354175567627, "grad_norm": 1.2065461874008179, "learning_rate": 9.592418169839968e-05} +{"ts": "2025-12-27T08:27:02", "event": "train_log", "step": 2834, "epoch": 1.1957805907172996, "progress_pct": 19.93, "epoch_pct": 19.93, "eta": "57:27:46", "max_grad_norm": 0.8, "loss": 0.6401656866073608, "grad_norm": 1.1710152626037598, "learning_rate": 9.591467960424261e-05} +{"ts": "2025-12-27T08:27:21", "event": "train_log", "step": 2836, "epoch": 1.1966244725738397, "progress_pct": 19.94, "epoch_pct": 19.94, "eta": "57:25:59", "max_grad_norm": 0.8, "loss": 0.7402615547180176, "grad_norm": 1.3324087858200073, "learning_rate": 9.590516691845077e-05} +{"ts": "2025-12-27T08:27:41", "event": "train_log", "step": 2838, "epoch": 1.1974683544303797, "progress_pct": 19.96, "epoch_pct": 19.96, "eta": "57:24:20", "max_grad_norm": 0.8, "loss": 0.5723769068717957, "grad_norm": 1.0100195407867432, "learning_rate": 9.589564364321855e-05} +{"ts": "2025-12-27T08:28:02", "event": "train_log", "step": 2840, "epoch": 1.1983122362869199, "progress_pct": 19.97, "epoch_pct": 19.97, "eta": "57:22:39", "max_grad_norm": 0.8, "loss": 0.6618966460227966, "grad_norm": 1.2706246376037598, "learning_rate": 9.588610978074277e-05} +{"ts": "2025-12-27T08:28:23", "event": "train_log", "step": 2842, "epoch": 1.1991561181434598, "progress_pct": 19.99, "epoch_pct": 19.99, "eta": "57:21:03", "max_grad_norm": 0.8, "loss": 0.7090804576873779, "grad_norm": 1.1921758651733398, "learning_rate": 9.587656533322273e-05} +{"ts": "2025-12-27T08:28:43", "event": "train_log", "step": 2844, "epoch": 1.2, "progress_pct": 20.0, "epoch_pct": 20.0, "eta": "57:19:22", "max_grad_norm": 0.8, "loss": 0.6930652856826782, "grad_norm": 1.36713445186615, "learning_rate": 9.586701030286014e-05} +{"ts": "2025-12-27T08:29:01", "event": "train_log", "step": 2846, "epoch": 1.2008438818565401, "progress_pct": 20.01, "epoch_pct": 20.01, "eta": "57:17:33", "max_grad_norm": 0.8, "loss": 0.7386236190795898, "grad_norm": 1.3084295988082886, "learning_rate": 9.585744469185917e-05} +{"ts": "2025-12-27T08:29:22", "event": "train_log", "step": 2848, "epoch": 1.20168776371308, "progress_pct": 20.03, "epoch_pct": 20.03, "eta": "57:15:55", "max_grad_norm": 0.8, "loss": 0.6179903149604797, "grad_norm": 1.198922038078308, "learning_rate": 9.584786850242642e-05} +{"ts": "2025-12-27T08:29:40", "event": "train_log", "step": 2850, "epoch": 1.2025316455696202, "progress_pct": 20.04, "epoch_pct": 20.04, "eta": "57:14:08", "max_grad_norm": 0.8, "loss": 0.7027528882026672, "grad_norm": 1.2106369733810425, "learning_rate": 9.583828173677092e-05} +{"ts": "2025-12-27T08:30:00", "event": "train_log", "step": 2852, "epoch": 1.2033755274261604, "progress_pct": 20.06, "epoch_pct": 20.06, "eta": "57:12:23", "max_grad_norm": 0.8, "loss": 0.6612945199012756, "grad_norm": 1.2959522008895874, "learning_rate": 9.582868439710418e-05} +{"ts": "2025-12-27T08:30:18", "event": "train_log", "step": 2854, "epoch": 1.2042194092827003, "progress_pct": 20.07, "epoch_pct": 20.07, "eta": "57:10:36", "max_grad_norm": 0.8, "loss": 0.7085917592048645, "grad_norm": 1.1441705226898193, "learning_rate": 9.58190764856401e-05} +{"ts": "2025-12-27T08:30:37", "event": "train_log", "step": 2856, "epoch": 1.2050632911392405, "progress_pct": 20.08, "epoch_pct": 20.08, "eta": "57:08:53", "max_grad_norm": 0.8, "loss": 0.7480600476264954, "grad_norm": 1.1586185693740845, "learning_rate": 9.580945800459504e-05} +{"ts": "2025-12-27T08:30:55", "event": "train_log", "step": 2858, "epoch": 1.2059071729957807, "progress_pct": 20.1, "epoch_pct": 20.1, "eta": "57:07:05", "max_grad_norm": 0.8, "loss": 0.7185836434364319, "grad_norm": 1.2068266868591309, "learning_rate": 9.579982895618783e-05} +{"ts": "2025-12-27T08:31:15", "event": "train_log", "step": 2860, "epoch": 1.2067510548523206, "progress_pct": 20.11, "epoch_pct": 20.11, "eta": "57:05:22", "max_grad_norm": 0.8, "loss": 0.6737306118011475, "grad_norm": 1.2188525199890137, "learning_rate": 9.579018934263966e-05} +{"ts": "2025-12-27T08:31:35", "event": "train_log", "step": 2862, "epoch": 1.2075949367088608, "progress_pct": 20.13, "epoch_pct": 20.13, "eta": "57:03:43", "max_grad_norm": 0.8, "loss": 0.7239293456077576, "grad_norm": 1.1513181924819946, "learning_rate": 9.578053916617423e-05} +{"ts": "2025-12-27T08:31:55", "event": "train_log", "step": 2864, "epoch": 1.208438818565401, "progress_pct": 20.14, "epoch_pct": 20.14, "eta": "57:02:03", "max_grad_norm": 0.8, "loss": 0.6416276097297668, "grad_norm": 1.2063703536987305, "learning_rate": 9.577087842901764e-05} +{"ts": "2025-12-27T08:32:15", "event": "train_log", "step": 2866, "epoch": 1.2092827004219409, "progress_pct": 20.15, "epoch_pct": 20.15, "eta": "57:00:20", "max_grad_norm": 0.8, "loss": 0.697213351726532, "grad_norm": 1.102460503578186, "learning_rate": 9.576120713339844e-05} +{"ts": "2025-12-27T08:32:35", "event": "train_log", "step": 2868, "epoch": 1.210126582278481, "progress_pct": 20.17, "epoch_pct": 20.17, "eta": "56:58:42", "max_grad_norm": 0.8, "loss": 0.6664742231369019, "grad_norm": 1.2484638690948486, "learning_rate": 9.575152528154763e-05} +{"ts": "2025-12-27T08:32:55", "event": "train_log", "step": 2870, "epoch": 1.2109704641350212, "progress_pct": 20.18, "epoch_pct": 20.18, "eta": "56:57:03", "max_grad_norm": 0.8, "loss": 0.6914868354797363, "grad_norm": 1.4476624727249146, "learning_rate": 9.57418328756986e-05} +{"ts": "2025-12-27T08:33:16", "event": "train_log", "step": 2872, "epoch": 1.2118143459915611, "progress_pct": 20.2, "epoch_pct": 20.2, "eta": "56:55:24", "max_grad_norm": 0.8, "loss": 0.662024736404419, "grad_norm": 1.0130122900009155, "learning_rate": 9.573212991808722e-05} +{"ts": "2025-12-27T08:33:37", "event": "train_log", "step": 2874, "epoch": 1.2126582278481013, "progress_pct": 20.21, "epoch_pct": 20.21, "eta": "56:53:48", "max_grad_norm": 0.8, "loss": 0.6330409646034241, "grad_norm": 1.014470100402832, "learning_rate": 9.572241641095177e-05} +{"ts": "2025-12-27T08:33:57", "event": "train_log", "step": 2876, "epoch": 1.2135021097046415, "progress_pct": 20.23, "epoch_pct": 20.23, "eta": "56:52:11", "max_grad_norm": 0.8, "loss": 0.6607463955879211, "grad_norm": 1.1803333759307861, "learning_rate": 9.571269235653298e-05} +{"ts": "2025-12-27T08:34:17", "event": "train_log", "step": 2878, "epoch": 1.2143459915611814, "progress_pct": 20.24, "epoch_pct": 20.24, "eta": "56:50:31", "max_grad_norm": 0.8, "loss": 0.6925629377365112, "grad_norm": 1.261366844177246, "learning_rate": 9.570295775707398e-05} +{"ts": "2025-12-27T08:34:37", "event": "train_log", "step": 2880, "epoch": 1.2151898734177216, "progress_pct": 20.25, "epoch_pct": 20.25, "eta": "56:48:52", "max_grad_norm": 0.8, "loss": 0.7070510983467102, "grad_norm": 1.226670503616333, "learning_rate": 9.569321261482037e-05} +{"ts": "2025-12-27T08:34:58", "event": "train_log", "step": 2882, "epoch": 1.2160337552742617, "progress_pct": 20.27, "epoch_pct": 20.27, "eta": "56:47:16", "max_grad_norm": 0.8, "loss": 0.7243561744689941, "grad_norm": 1.164565920829773, "learning_rate": 9.568345693202016e-05} +{"ts": "2025-12-27T08:35:19", "event": "train_log", "step": 2884, "epoch": 1.2168776371308017, "progress_pct": 20.28, "epoch_pct": 20.28, "eta": "56:45:39", "max_grad_norm": 0.8, "loss": 0.6316909790039062, "grad_norm": 1.060331106185913, "learning_rate": 9.567369071092382e-05} +{"ts": "2025-12-27T08:35:38", "event": "train_log", "step": 2886, "epoch": 1.2177215189873418, "progress_pct": 20.3, "epoch_pct": 20.3, "eta": "56:43:59", "max_grad_norm": 0.8, "loss": 0.6139125227928162, "grad_norm": 1.1998693943023682, "learning_rate": 9.566391395378419e-05} +{"ts": "2025-12-27T08:35:59", "event": "train_log", "step": 2888, "epoch": 1.2185654008438818, "progress_pct": 20.31, "epoch_pct": 20.31, "eta": "56:42:21", "max_grad_norm": 0.8, "loss": 0.688897430896759, "grad_norm": 1.1875834465026855, "learning_rate": 9.565412666285661e-05} +{"ts": "2025-12-27T08:36:18", "event": "train_log", "step": 2890, "epoch": 1.219409282700422, "progress_pct": 20.32, "epoch_pct": 20.32, "eta": "56:40:39", "max_grad_norm": 0.8, "loss": 0.684590756893158, "grad_norm": 1.199174404144287, "learning_rate": 9.564432884039882e-05} +{"ts": "2025-12-27T08:36:37", "event": "train_log", "step": 2892, "epoch": 1.220253164556962, "progress_pct": 20.34, "epoch_pct": 20.34, "eta": "56:38:57", "max_grad_norm": 0.8, "loss": 0.67433100938797, "grad_norm": 1.2428219318389893, "learning_rate": 9.563452048867099e-05} +{"ts": "2025-12-27T08:36:57", "event": "train_log", "step": 2894, "epoch": 1.221097046413502, "progress_pct": 20.35, "epoch_pct": 20.35, "eta": "56:37:20", "max_grad_norm": 0.8, "loss": 0.6959785223007202, "grad_norm": 1.0826431512832642, "learning_rate": 9.562470160993568e-05} +{"ts": "2025-12-27T08:37:16", "event": "train_log", "step": 2896, "epoch": 1.2219409282700422, "progress_pct": 20.37, "epoch_pct": 20.37, "eta": "56:35:37", "max_grad_norm": 0.8, "loss": 0.6443175673484802, "grad_norm": 1.3140246868133545, "learning_rate": 9.561487220645797e-05} +{"ts": "2025-12-27T08:37:35", "event": "train_log", "step": 2898, "epoch": 1.2227848101265824, "progress_pct": 20.38, "epoch_pct": 20.38, "eta": "56:33:55", "max_grad_norm": 0.8, "loss": 0.6715332865715027, "grad_norm": 1.2758334875106812, "learning_rate": 9.560503228050529e-05} +{"ts": "2025-12-27T08:37:54", "event": "train_log", "step": 2900, "epoch": 1.2236286919831223, "progress_pct": 20.39, "epoch_pct": 20.39, "eta": "56:32:11", "max_grad_norm": 0.8, "loss": 0.6896081566810608, "grad_norm": 1.3326421976089478, "learning_rate": 9.559518183434753e-05} +{"ts": "2025-12-27T08:52:09", "event": "train_log", "step": 2900, "epoch": 1.2236286919831223, "progress_pct": 20.39, "epoch_pct": 20.39, "eta": "57:27:47", "max_grad_norm": 0.8, "eval_loss": 0.7281573414802551, "eval_runtime": 854.563, "eval_samples_per_second": 2.466, "eval_steps_per_second": 2.466} +{"ts": "2025-12-27T08:52:29", "event": "train_log", "step": 2902, "epoch": 1.2244725738396625, "progress_pct": 20.41, "epoch_pct": 20.41, "eta": "57:26:06", "max_grad_norm": 0.8, "loss": 0.6797633171081543, "grad_norm": 1.3225606679916382, "learning_rate": 9.558532087025697e-05} +{"ts": "2025-12-27T08:52:48", "event": "train_log", "step": 2904, "epoch": 1.2253164556962026, "progress_pct": 20.42, "epoch_pct": 20.42, "eta": "57:24:23", "max_grad_norm": 0.8, "loss": 0.6510948538780212, "grad_norm": 1.3058340549468994, "learning_rate": 9.55754493905084e-05} +{"ts": "2025-12-27T08:53:08", "event": "train_log", "step": 2906, "epoch": 1.2261603375527426, "progress_pct": 20.44, "epoch_pct": 20.44, "eta": "57:22:42", "max_grad_norm": 0.8, "loss": 0.6481176614761353, "grad_norm": 1.140268087387085, "learning_rate": 9.556556739737892e-05} +{"ts": "2025-12-27T08:53:27", "event": "train_log", "step": 2908, "epoch": 1.2270042194092827, "progress_pct": 20.45, "epoch_pct": 20.45, "eta": "57:20:57", "max_grad_norm": 0.8, "loss": 0.7533771991729736, "grad_norm": 1.465113639831543, "learning_rate": 9.555567489314816e-05} +{"ts": "2025-12-27T08:53:47", "event": "train_log", "step": 2910, "epoch": 1.2278481012658227, "progress_pct": 20.46, "epoch_pct": 20.46, "eta": "57:19:18", "max_grad_norm": 0.8, "loss": 0.6924305558204651, "grad_norm": 1.1468979120254517, "learning_rate": 9.554577188009812e-05} +{"ts": "2025-12-27T08:54:08", "event": "train_log", "step": 2912, "epoch": 1.2286919831223628, "progress_pct": 20.48, "epoch_pct": 20.48, "eta": "57:17:38", "max_grad_norm": 0.8, "loss": 0.7082820534706116, "grad_norm": 1.2193517684936523, "learning_rate": 9.553585836051321e-05} +{"ts": "2025-12-27T08:54:27", "event": "train_log", "step": 2914, "epoch": 1.229535864978903, "progress_pct": 20.49, "epoch_pct": 20.49, "eta": "57:15:54", "max_grad_norm": 0.8, "loss": 0.6735695004463196, "grad_norm": 1.2015037536621094, "learning_rate": 9.552593433668034e-05} +{"ts": "2025-12-27T08:54:47", "event": "train_log", "step": 2916, "epoch": 1.230379746835443, "progress_pct": 20.51, "epoch_pct": 20.51, "eta": "57:14:15", "max_grad_norm": 0.8, "loss": 0.7312048673629761, "grad_norm": 1.1915435791015625, "learning_rate": 9.551599981088874e-05} +{"ts": "2025-12-27T08:55:05", "event": "train_log", "step": 2918, "epoch": 1.231223628691983, "progress_pct": 20.52, "epoch_pct": 20.52, "eta": "57:12:29", "max_grad_norm": 0.8, "loss": 0.6590308547019958, "grad_norm": 1.2849410772323608, "learning_rate": 9.550605478543013e-05} +{"ts": "2025-12-27T08:55:24", "event": "train_log", "step": 2920, "epoch": 1.2320675105485233, "progress_pct": 20.53, "epoch_pct": 20.53, "eta": "57:10:43", "max_grad_norm": 0.8, "loss": 0.6237715482711792, "grad_norm": 1.192238688468933, "learning_rate": 9.549609926259866e-05} +{"ts": "2025-12-27T08:55:46", "event": "train_log", "step": 2922, "epoch": 1.2329113924050632, "progress_pct": 20.55, "epoch_pct": 20.55, "eta": "57:09:11", "max_grad_norm": 0.8, "loss": 0.6546295881271362, "grad_norm": 1.141845703125, "learning_rate": 9.548613324469085e-05} +{"ts": "2025-12-27T08:56:07", "event": "train_log", "step": 2924, "epoch": 1.2337552742616034, "progress_pct": 20.56, "epoch_pct": 20.56, "eta": "57:07:35", "max_grad_norm": 0.8, "loss": 0.5800934433937073, "grad_norm": 1.1662311553955078, "learning_rate": 9.547615673400566e-05} +{"ts": "2025-12-27T08:56:28", "event": "train_log", "step": 2926, "epoch": 1.2345991561181435, "progress_pct": 20.58, "epoch_pct": 20.58, "eta": "57:05:59", "max_grad_norm": 0.8, "loss": 0.6487136483192444, "grad_norm": 1.120578646659851, "learning_rate": 9.546616973284453e-05} +{"ts": "2025-12-27T08:56:50", "event": "train_log", "step": 2928, "epoch": 1.2354430379746835, "progress_pct": 20.59, "epoch_pct": 20.59, "eta": "57:04:26", "max_grad_norm": 0.8, "loss": 0.7515342235565186, "grad_norm": 1.0884860754013062, "learning_rate": 9.54561722435112e-05} +{"ts": "2025-12-27T08:57:09", "event": "train_log", "step": 2930, "epoch": 1.2362869198312236, "progress_pct": 20.6, "epoch_pct": 20.6, "eta": "57:02:44", "max_grad_norm": 0.8, "loss": 0.7162003517150879, "grad_norm": 1.4208670854568481, "learning_rate": 9.544616426831196e-05} +{"ts": "2025-12-27T08:57:29", "event": "train_log", "step": 2932, "epoch": 1.2371308016877638, "progress_pct": 20.62, "epoch_pct": 20.62, "eta": "57:01:03", "max_grad_norm": 0.8, "loss": 0.708450198173523, "grad_norm": 1.083389401435852, "learning_rate": 9.543614580955543e-05} +{"ts": "2025-12-27T08:57:49", "event": "train_log", "step": 2934, "epoch": 1.2379746835443037, "progress_pct": 20.63, "epoch_pct": 20.63, "eta": "56:59:25", "max_grad_norm": 0.8, "loss": 0.6255859732627869, "grad_norm": 1.141364336013794, "learning_rate": 9.542611686955268e-05} +{"ts": "2025-12-27T08:58:11", "event": "train_log", "step": 2936, "epoch": 1.238818565400844, "progress_pct": 20.65, "epoch_pct": 20.65, "eta": "56:57:53", "max_grad_norm": 0.8, "loss": 0.6485402584075928, "grad_norm": 1.122036099433899, "learning_rate": 9.54160774506172e-05} +{"ts": "2025-12-27T08:58:31", "event": "train_log", "step": 2938, "epoch": 1.239662447257384, "progress_pct": 20.66, "epoch_pct": 20.66, "eta": "56:56:13", "max_grad_norm": 0.8, "loss": 0.6735473871231079, "grad_norm": 1.3514165878295898, "learning_rate": 9.540602755506487e-05} +{"ts": "2025-12-27T08:58:51", "event": "train_log", "step": 2940, "epoch": 1.240506329113924, "progress_pct": 20.68, "epoch_pct": 20.68, "eta": "56:54:36", "max_grad_norm": 0.8, "loss": 0.6154970526695251, "grad_norm": 1.1762629747390747, "learning_rate": 9.539596718521403e-05} +{"ts": "2025-12-27T08:59:10", "event": "train_log", "step": 2942, "epoch": 1.2413502109704642, "progress_pct": 20.69, "epoch_pct": 20.69, "eta": "56:52:53", "max_grad_norm": 0.8, "loss": 0.6410251259803772, "grad_norm": 1.1609408855438232, "learning_rate": 9.53858963433854e-05} +{"ts": "2025-12-27T08:59:30", "event": "train_log", "step": 2944, "epoch": 1.2421940928270043, "progress_pct": 20.7, "epoch_pct": 20.7, "eta": "56:51:15", "max_grad_norm": 0.8, "loss": 0.6841039657592773, "grad_norm": 1.1750361919403076, "learning_rate": 9.537581503190214e-05} +{"ts": "2025-12-27T08:59:51", "event": "train_log", "step": 2946, "epoch": 1.2430379746835443, "progress_pct": 20.72, "epoch_pct": 20.72, "eta": "56:49:39", "max_grad_norm": 0.8, "loss": 0.7293462753295898, "grad_norm": 1.3125680685043335, "learning_rate": 9.536572325308982e-05} +{"ts": "2025-12-27T09:00:11", "event": "train_log", "step": 2948, "epoch": 1.2438818565400844, "progress_pct": 20.73, "epoch_pct": 20.73, "eta": "56:48:01", "max_grad_norm": 0.8, "loss": 0.7713663578033447, "grad_norm": 1.1737277507781982, "learning_rate": 9.53556210092764e-05} +{"ts": "2025-12-27T09:00:30", "event": "train_log", "step": 2950, "epoch": 1.2447257383966246, "progress_pct": 20.75, "epoch_pct": 20.75, "eta": "56:46:18", "max_grad_norm": 0.8, "loss": 0.6612298488616943, "grad_norm": 1.1702152490615845, "learning_rate": 9.53455083027923e-05} +{"ts": "2025-12-27T09:00:49", "event": "train_log", "step": 2952, "epoch": 1.2455696202531645, "progress_pct": 20.76, "epoch_pct": 20.76, "eta": "56:44:37", "max_grad_norm": 0.8, "loss": 0.6725803017616272, "grad_norm": 1.2594486474990845, "learning_rate": 9.533538513597028e-05} +{"ts": "2025-12-27T09:01:09", "event": "train_log", "step": 2954, "epoch": 1.2464135021097047, "progress_pct": 20.77, "epoch_pct": 20.77, "eta": "56:42:58", "max_grad_norm": 0.8, "loss": 0.6421069502830505, "grad_norm": 1.180816411972046, "learning_rate": 9.532525151114562e-05} +{"ts": "2025-12-27T09:01:27", "event": "train_log", "step": 2956, "epoch": 1.2472573839662446, "progress_pct": 20.79, "epoch_pct": 20.79, "eta": "56:41:12", "max_grad_norm": 0.8, "loss": 0.7042996287345886, "grad_norm": 1.25814688205719, "learning_rate": 9.531510743065593e-05} +{"ts": "2025-12-27T09:01:46", "event": "train_log", "step": 2958, "epoch": 1.2481012658227848, "progress_pct": 20.8, "epoch_pct": 20.8, "eta": "56:39:30", "max_grad_norm": 0.8, "loss": 0.7359137535095215, "grad_norm": 1.2101783752441406, "learning_rate": 9.530495289684122e-05} +{"ts": "2025-12-27T09:02:06", "event": "train_log", "step": 2960, "epoch": 1.248945147679325, "progress_pct": 20.82, "epoch_pct": 20.82, "eta": "56:37:51", "max_grad_norm": 0.8, "loss": 0.6186386346817017, "grad_norm": 1.1438405513763428, "learning_rate": 9.5294787912044e-05} +{"ts": "2025-12-27T09:02:25", "event": "train_log", "step": 2962, "epoch": 1.249789029535865, "progress_pct": 20.83, "epoch_pct": 20.83, "eta": "56:36:12", "max_grad_norm": 0.8, "loss": 0.6243056058883667, "grad_norm": 1.163364291191101, "learning_rate": 9.52846124786091e-05} +{"ts": "2025-12-27T09:02:46", "event": "train_log", "step": 2964, "epoch": 1.250632911392405, "progress_pct": 20.84, "epoch_pct": 20.84, "eta": "56:34:37", "max_grad_norm": 0.8, "loss": 0.6568763852119446, "grad_norm": 1.0695953369140625, "learning_rate": 9.52744265988838e-05} +{"ts": "2025-12-27T09:03:05", "event": "train_log", "step": 2966, "epoch": 1.2514767932489452, "progress_pct": 20.86, "epoch_pct": 20.86, "eta": "56:32:56", "max_grad_norm": 0.8, "loss": 0.6486776471138, "grad_norm": 1.2228879928588867, "learning_rate": 9.52642302752178e-05} +{"ts": "2025-12-27T09:03:24", "event": "train_log", "step": 2968, "epoch": 1.2523206751054852, "progress_pct": 20.87, "epoch_pct": 20.87, "eta": "56:31:15", "max_grad_norm": 0.8, "loss": 0.6293455958366394, "grad_norm": 1.2262967824935913, "learning_rate": 9.52540235099632e-05} +{"ts": "2025-12-27T09:03:44", "event": "train_log", "step": 2970, "epoch": 1.2531645569620253, "progress_pct": 20.89, "epoch_pct": 20.89, "eta": "56:29:36", "max_grad_norm": 0.8, "loss": 0.6549884080886841, "grad_norm": 1.0862956047058105, "learning_rate": 9.524380630547449e-05} +{"ts": "2025-12-27T09:04:04", "event": "train_log", "step": 2972, "epoch": 1.2540084388185653, "progress_pct": 20.9, "epoch_pct": 20.9, "eta": "56:27:58", "max_grad_norm": 0.8, "loss": 0.6126490831375122, "grad_norm": 1.1721880435943604, "learning_rate": 9.52335786641086e-05} +{"ts": "2025-12-27T09:04:24", "event": "train_log", "step": 2974, "epoch": 1.2548523206751054, "progress_pct": 20.91, "epoch_pct": 20.91, "eta": "56:26:20", "max_grad_norm": 0.8, "loss": 0.7078590393066406, "grad_norm": 1.2452391386032104, "learning_rate": 9.522334058822483e-05} +{"ts": "2025-12-27T09:04:42", "event": "train_log", "step": 2976, "epoch": 1.2556962025316456, "progress_pct": 20.93, "epoch_pct": 20.93, "eta": "56:24:38", "max_grad_norm": 0.8, "loss": 0.6166214942932129, "grad_norm": 1.2290222644805908, "learning_rate": 9.521309208018492e-05} +{"ts": "2025-12-27T09:05:02", "event": "train_log", "step": 2978, "epoch": 1.2565400843881855, "progress_pct": 20.94, "epoch_pct": 20.94, "eta": "56:23:00", "max_grad_norm": 0.8, "loss": 0.666228175163269, "grad_norm": 1.1823618412017822, "learning_rate": 9.520283314235299e-05} +{"ts": "2025-12-27T09:05:21", "event": "train_log", "step": 2980, "epoch": 1.2573839662447257, "progress_pct": 20.96, "epoch_pct": 20.96, "eta": "56:21:18", "max_grad_norm": 0.8, "loss": 0.7436795830726624, "grad_norm": 1.1702475547790527, "learning_rate": 9.51925637770956e-05} +{"ts": "2025-12-27T09:05:41", "event": "train_log", "step": 2982, "epoch": 1.2582278481012659, "progress_pct": 20.97, "epoch_pct": 20.97, "eta": "56:19:42", "max_grad_norm": 0.8, "loss": 0.7120893001556396, "grad_norm": 1.0879321098327637, "learning_rate": 9.518228398678168e-05} +{"ts": "2025-12-27T09:06:01", "event": "train_log", "step": 2984, "epoch": 1.2590717299578058, "progress_pct": 20.98, "epoch_pct": 20.98, "eta": "56:18:05", "max_grad_norm": 0.8, "loss": 0.6931713223457336, "grad_norm": 1.1608418226242065, "learning_rate": 9.517199377378261e-05} +{"ts": "2025-12-27T09:06:20", "event": "train_log", "step": 2986, "epoch": 1.259915611814346, "progress_pct": 21.0, "epoch_pct": 21.0, "eta": "56:16:26", "max_grad_norm": 0.8, "loss": 0.6803538799285889, "grad_norm": 1.1289087533950806, "learning_rate": 9.51616931404721e-05} +{"ts": "2025-12-27T09:06:39", "event": "train_log", "step": 2988, "epoch": 1.2607594936708861, "progress_pct": 21.01, "epoch_pct": 21.01, "eta": "56:14:47", "max_grad_norm": 0.8, "loss": 0.6499706506729126, "grad_norm": 1.1622236967086792, "learning_rate": 9.515138208922633e-05} +{"ts": "2025-12-27T09:06:58", "event": "train_log", "step": 2990, "epoch": 1.261603375527426, "progress_pct": 21.03, "epoch_pct": 21.03, "eta": "56:13:07", "max_grad_norm": 0.8, "loss": 0.6132655739784241, "grad_norm": 1.2492594718933105, "learning_rate": 9.514106062242386e-05} +{"ts": "2025-12-27T09:07:19", "event": "train_log", "step": 2992, "epoch": 1.2624472573839662, "progress_pct": 21.04, "epoch_pct": 21.04, "eta": "56:11:33", "max_grad_norm": 0.8, "loss": 0.6309265494346619, "grad_norm": 1.1538822650909424, "learning_rate": 9.513072874244567e-05} +{"ts": "2025-12-27T09:07:39", "event": "train_log", "step": 2994, "epoch": 1.2632911392405064, "progress_pct": 21.05, "epoch_pct": 21.05, "eta": "56:09:56", "max_grad_norm": 0.8, "loss": 0.6297751665115356, "grad_norm": 1.0828478336334229, "learning_rate": 9.512038645167509e-05} +{"ts": "2025-12-27T09:07:58", "event": "train_log", "step": 2996, "epoch": 1.2641350210970463, "progress_pct": 21.07, "epoch_pct": 21.07, "eta": "56:08:16", "max_grad_norm": 0.8, "loss": 0.6335258483886719, "grad_norm": 1.2440937757492065, "learning_rate": 9.511003375249792e-05} +{"ts": "2025-12-27T09:08:17", "event": "train_log", "step": 2998, "epoch": 1.2649789029535865, "progress_pct": 21.08, "epoch_pct": 21.08, "eta": "56:06:38", "max_grad_norm": 0.8, "loss": 0.6513770818710327, "grad_norm": 1.1259970664978027, "learning_rate": 9.50996706473023e-05} +{"ts": "2025-12-27T09:08:37", "event": "train_log", "step": 3000, "epoch": 1.2658227848101267, "progress_pct": 21.1, "epoch_pct": 21.1, "eta": "56:05:01", "max_grad_norm": 0.8, "loss": 0.6490892767906189, "grad_norm": 1.1530309915542603, "learning_rate": 9.508929713847884e-05} +{"ts": "2025-12-27T09:23:05", "event": "train_log", "step": 3000, "epoch": 1.2658227848101267, "progress_pct": 21.1, "epoch_pct": 21.1, "eta": "56:59:07", "max_grad_norm": 0.8, "eval_loss": 0.72515869140625, "eval_runtime": 868.0515, "eval_samples_per_second": 2.427, "eval_steps_per_second": 2.427} +{"ts": "2025-12-27T09:23:24", "event": "train_log", "step": 3002, "epoch": 1.2666666666666666, "progress_pct": 21.11, "epoch_pct": 21.11, "eta": "56:57:24", "max_grad_norm": 0.8, "loss": 0.6936060786247253, "grad_norm": 1.2257169485092163, "learning_rate": 9.507891322842048e-05} +{"ts": "2025-12-27T09:23:45", "event": "train_log", "step": 3004, "epoch": 1.2675105485232068, "progress_pct": 21.13, "epoch_pct": 21.13, "eta": "56:55:51", "max_grad_norm": 0.8, "loss": 0.5941951870918274, "grad_norm": 1.0380109548568726, "learning_rate": 9.506851891952259e-05} +{"ts": "2025-12-27T09:24:04", "event": "train_log", "step": 3006, "epoch": 1.268354430379747, "progress_pct": 21.14, "epoch_pct": 21.14, "eta": "56:54:11", "max_grad_norm": 0.8, "loss": 0.648429811000824, "grad_norm": 1.2830222845077515, "learning_rate": 9.505811421418296e-05} +{"ts": "2025-12-27T09:24:23", "event": "train_log", "step": 3008, "epoch": 1.2691983122362869, "progress_pct": 21.15, "epoch_pct": 21.15, "eta": "56:52:28", "max_grad_norm": 0.8, "loss": 0.6868565678596497, "grad_norm": 1.2212986946105957, "learning_rate": 9.504769911480171e-05} +{"ts": "2025-12-27T09:24:43", "event": "train_log", "step": 3010, "epoch": 1.270042194092827, "progress_pct": 21.17, "epoch_pct": 21.17, "eta": "56:50:49", "max_grad_norm": 0.8, "loss": 0.6777986288070679, "grad_norm": 1.104656457901001, "learning_rate": 9.503727362378145e-05} +{"ts": "2025-12-27T09:25:01", "event": "train_log", "step": 3012, "epoch": 1.2708860759493672, "progress_pct": 21.18, "epoch_pct": 21.18, "eta": "56:49:05", "max_grad_norm": 0.8, "loss": 0.6581128239631653, "grad_norm": 1.1449005603790283, "learning_rate": 9.502683774352713e-05} +{"ts": "2025-12-27T09:25:19", "event": "train_log", "step": 3014, "epoch": 1.2717299578059071, "progress_pct": 21.2, "epoch_pct": 21.2, "eta": "56:47:17", "max_grad_norm": 0.8, "loss": 0.689930260181427, "grad_norm": 1.2753362655639648, "learning_rate": 9.501639147644608e-05} +{"ts": "2025-12-27T09:25:38", "event": "train_log", "step": 3016, "epoch": 1.2725738396624473, "progress_pct": 21.21, "epoch_pct": 21.21, "eta": "56:45:35", "max_grad_norm": 0.8, "loss": 0.7549214363098145, "grad_norm": 1.3367106914520264, "learning_rate": 9.500593482494809e-05} +{"ts": "2025-12-27T09:25:56", "event": "train_log", "step": 3018, "epoch": 1.2734177215189875, "progress_pct": 21.22, "epoch_pct": 21.22, "eta": "56:43:53", "max_grad_norm": 0.8, "loss": 0.6713513135910034, "grad_norm": 1.2309048175811768, "learning_rate": 9.499546779144528e-05} +{"ts": "2025-12-27T09:26:16", "event": "train_log", "step": 3020, "epoch": 1.2742616033755274, "progress_pct": 21.24, "epoch_pct": 21.24, "eta": "56:42:13", "max_grad_norm": 0.8, "loss": 0.7045458555221558, "grad_norm": 1.3833240270614624, "learning_rate": 9.49849903783522e-05} +{"ts": "2025-12-27T09:26:36", "event": "train_log", "step": 3022, "epoch": 1.2751054852320676, "progress_pct": 21.25, "epoch_pct": 21.25, "eta": "56:40:35", "max_grad_norm": 0.8, "loss": 0.708249568939209, "grad_norm": 1.1402570009231567, "learning_rate": 9.49745025880858e-05} +{"ts": "2025-12-27T09:26:55", "event": "train_log", "step": 3024, "epoch": 1.2759493670886077, "progress_pct": 21.27, "epoch_pct": 21.27, "eta": "56:38:56", "max_grad_norm": 0.8, "loss": 0.616210401058197, "grad_norm": 1.0476267337799072, "learning_rate": 9.496400442306541e-05} +{"ts": "2025-12-27T09:27:15", "event": "train_log", "step": 3026, "epoch": 1.2767932489451477, "progress_pct": 21.28, "epoch_pct": 21.28, "eta": "56:37:18", "max_grad_norm": 0.8, "loss": 0.6691827178001404, "grad_norm": 1.1045979261398315, "learning_rate": 9.495349588571274e-05} +{"ts": "2025-12-27T09:27:35", "event": "train_log", "step": 3028, "epoch": 1.2776371308016878, "progress_pct": 21.29, "epoch_pct": 21.29, "eta": "56:35:41", "max_grad_norm": 0.8, "loss": 0.6198306083679199, "grad_norm": 1.1760368347167969, "learning_rate": 9.494297697845194e-05} +{"ts": "2025-12-27T09:27:56", "event": "train_log", "step": 3030, "epoch": 1.2784810126582278, "progress_pct": 21.31, "epoch_pct": 21.31, "eta": "56:34:09", "max_grad_norm": 0.8, "loss": 0.5756480097770691, "grad_norm": 1.0015549659729004, "learning_rate": 9.493244770370946e-05} +{"ts": "2025-12-27T09:28:15", "event": "train_log", "step": 3032, "epoch": 1.279324894514768, "progress_pct": 21.32, "epoch_pct": 21.32, "eta": "56:32:29", "max_grad_norm": 0.8, "loss": 0.6794419884681702, "grad_norm": 1.2190428972244263, "learning_rate": 9.492190806391427e-05} +{"ts": "2025-12-27T09:28:37", "event": "train_log", "step": 3034, "epoch": 1.2801687763713079, "progress_pct": 21.34, "epoch_pct": 21.34, "eta": "56:30:57", "max_grad_norm": 0.8, "loss": 0.5847988724708557, "grad_norm": 1.0210410356521606, "learning_rate": 9.491135806149762e-05} +{"ts": "2025-12-27T09:28:56", "event": "train_log", "step": 3036, "epoch": 1.281012658227848, "progress_pct": 21.35, "epoch_pct": 21.35, "eta": "56:29:19", "max_grad_norm": 0.8, "loss": 0.6760231256484985, "grad_norm": 1.0678503513336182, "learning_rate": 9.490079769889319e-05} +{"ts": "2025-12-27T09:29:16", "event": "train_log", "step": 3038, "epoch": 1.2818565400843882, "progress_pct": 21.36, "epoch_pct": 21.36, "eta": "56:27:41", "max_grad_norm": 0.8, "loss": 0.7188448309898376, "grad_norm": 1.1811012029647827, "learning_rate": 9.489022697853709e-05} +{"ts": "2025-12-27T09:29:34", "event": "train_log", "step": 3040, "epoch": 1.2827004219409281, "progress_pct": 21.38, "epoch_pct": 21.38, "eta": "56:25:58", "max_grad_norm": 0.8, "loss": 0.674904465675354, "grad_norm": 1.1134302616119385, "learning_rate": 9.487964590286776e-05} +{"ts": "2025-12-27T09:29:53", "event": "train_log", "step": 3042, "epoch": 1.2835443037974683, "progress_pct": 21.39, "epoch_pct": 21.39, "eta": "56:24:17", "max_grad_norm": 0.8, "loss": 0.6016344428062439, "grad_norm": 1.1868232488632202, "learning_rate": 9.486905447432603e-05} +{"ts": "2025-12-27T09:30:12", "event": "train_log", "step": 3044, "epoch": 1.2843881856540085, "progress_pct": 21.41, "epoch_pct": 21.41, "eta": "56:22:37", "max_grad_norm": 0.8, "loss": 0.6965603828430176, "grad_norm": 1.1586613655090332, "learning_rate": 9.485845269535517e-05} +{"ts": "2025-12-27T09:30:32", "event": "train_log", "step": 3046, "epoch": 1.2852320675105484, "progress_pct": 21.42, "epoch_pct": 21.42, "eta": "56:21:01", "max_grad_norm": 0.8, "loss": 0.656144380569458, "grad_norm": 1.149837613105774, "learning_rate": 9.48478405684008e-05} +{"ts": "2025-12-27T09:30:51", "event": "train_log", "step": 3048, "epoch": 1.2860759493670886, "progress_pct": 21.43, "epoch_pct": 21.43, "eta": "56:19:21", "max_grad_norm": 0.8, "loss": 0.6388653516769409, "grad_norm": 1.228752613067627, "learning_rate": 9.48372180959109e-05} +{"ts": "2025-12-27T09:31:11", "event": "train_log", "step": 3050, "epoch": 1.2869198312236287, "progress_pct": 21.45, "epoch_pct": 21.45, "eta": "56:17:47", "max_grad_norm": 0.8, "loss": 0.6255465745925903, "grad_norm": 1.2403100728988647, "learning_rate": 9.482658528033595e-05} +{"ts": "2025-12-27T09:31:30", "event": "train_log", "step": 3052, "epoch": 1.2877637130801687, "progress_pct": 21.46, "epoch_pct": 21.46, "eta": "56:16:06", "max_grad_norm": 0.8, "loss": 0.6828253269195557, "grad_norm": 1.2483839988708496, "learning_rate": 9.481594212412865e-05} +{"ts": "2025-12-27T09:31:50", "event": "train_log", "step": 3054, "epoch": 1.2886075949367088, "progress_pct": 21.48, "epoch_pct": 21.48, "eta": "56:14:29", "max_grad_norm": 0.8, "loss": 0.7072080373764038, "grad_norm": 1.4161021709442139, "learning_rate": 9.480528862974422e-05} +{"ts": "2025-12-27T09:32:09", "event": "train_log", "step": 3056, "epoch": 1.289451476793249, "progress_pct": 21.49, "epoch_pct": 21.49, "eta": "56:12:51", "max_grad_norm": 0.8, "loss": 0.6082415580749512, "grad_norm": 1.1500437259674072, "learning_rate": 9.479462479964021e-05} +{"ts": "2025-12-27T09:32:29", "event": "train_log", "step": 3058, "epoch": 1.290295358649789, "progress_pct": 21.5, "epoch_pct": 21.5, "eta": "56:11:15", "max_grad_norm": 0.8, "loss": 0.6653015613555908, "grad_norm": 1.196595549583435, "learning_rate": 9.478395063627654e-05} +{"ts": "2025-12-27T09:32:48", "event": "train_log", "step": 3060, "epoch": 1.2911392405063291, "progress_pct": 21.52, "epoch_pct": 21.52, "eta": "56:09:34", "max_grad_norm": 0.8, "loss": 0.7095832824707031, "grad_norm": 1.2832285165786743, "learning_rate": 9.477326614211557e-05} +{"ts": "2025-12-27T09:33:07", "event": "train_log", "step": 3062, "epoch": 1.2919831223628693, "progress_pct": 21.53, "epoch_pct": 21.53, "eta": "56:07:55", "max_grad_norm": 0.8, "loss": 0.7183426022529602, "grad_norm": 1.2234288454055786, "learning_rate": 9.476257131962198e-05} +{"ts": "2025-12-27T09:33:25", "event": "train_log", "step": 3064, "epoch": 1.2928270042194092, "progress_pct": 21.55, "epoch_pct": 21.55, "eta": "56:06:13", "max_grad_norm": 0.8, "loss": 0.713284432888031, "grad_norm": 1.2350459098815918, "learning_rate": 9.475186617126286e-05} +{"ts": "2025-12-27T09:33:44", "event": "train_log", "step": 3066, "epoch": 1.2936708860759494, "progress_pct": 21.56, "epoch_pct": 21.56, "eta": "56:04:35", "max_grad_norm": 0.8, "loss": 0.6580002307891846, "grad_norm": 1.2079555988311768, "learning_rate": 9.47411506995077e-05} +{"ts": "2025-12-27T09:34:02", "event": "train_log", "step": 3068, "epoch": 1.2945147679324895, "progress_pct": 21.58, "epoch_pct": 21.58, "eta": "56:02:54", "max_grad_norm": 0.8, "loss": 0.5967763662338257, "grad_norm": 1.129796028137207, "learning_rate": 9.473042490682835e-05} +{"ts": "2025-12-27T09:34:21", "event": "train_log", "step": 3070, "epoch": 1.2953586497890295, "progress_pct": 21.59, "epoch_pct": 21.59, "eta": "56:01:16", "max_grad_norm": 0.8, "loss": 0.6724388003349304, "grad_norm": 1.1706618070602417, "learning_rate": 9.471968879569901e-05} +{"ts": "2025-12-27T09:34:40", "event": "train_log", "step": 3072, "epoch": 1.2962025316455696, "progress_pct": 21.6, "epoch_pct": 21.6, "eta": "55:59:37", "max_grad_norm": 0.8, "loss": 0.6527577638626099, "grad_norm": 1.0336005687713623, "learning_rate": 9.470894236859635e-05} +{"ts": "2025-12-27T09:35:00", "event": "train_log", "step": 3074, "epoch": 1.2970464135021098, "progress_pct": 21.62, "epoch_pct": 21.62, "eta": "55:58:02", "max_grad_norm": 0.8, "loss": 0.677132785320282, "grad_norm": 1.1124558448791504, "learning_rate": 9.469818562799932e-05} +{"ts": "2025-12-27T09:35:20", "event": "train_log", "step": 3076, "epoch": 1.2978902953586497, "progress_pct": 21.63, "epoch_pct": 21.63, "eta": "55:56:25", "max_grad_norm": 0.8, "loss": 0.649718165397644, "grad_norm": 1.158069372177124, "learning_rate": 9.468741857638933e-05} +{"ts": "2025-12-27T09:35:39", "event": "train_log", "step": 3078, "epoch": 1.29873417721519, "progress_pct": 21.65, "epoch_pct": 21.65, "eta": "55:54:48", "max_grad_norm": 0.8, "loss": 0.6872133612632751, "grad_norm": 1.092926263809204, "learning_rate": 9.46766412162501e-05} +{"ts": "2025-12-27T09:35:58", "event": "train_log", "step": 3080, "epoch": 1.29957805907173, "progress_pct": 21.66, "epoch_pct": 21.66, "eta": "55:53:12", "max_grad_norm": 0.8, "loss": 0.6495246291160583, "grad_norm": 1.1324822902679443, "learning_rate": 9.466585355006777e-05} +{"ts": "2025-12-27T09:36:19", "event": "train_log", "step": 3082, "epoch": 1.30042194092827, "progress_pct": 21.67, "epoch_pct": 21.67, "eta": "55:51:38", "max_grad_norm": 0.8, "loss": 0.6730570197105408, "grad_norm": 1.5882837772369385, "learning_rate": 9.465505558033086e-05} +{"ts": "2025-12-27T09:36:39", "event": "train_log", "step": 3084, "epoch": 1.3012658227848102, "progress_pct": 21.69, "epoch_pct": 21.69, "eta": "55:50:06", "max_grad_norm": 0.8, "loss": 0.5677527785301208, "grad_norm": 0.9866069555282593, "learning_rate": 9.464424730953023e-05} +{"ts": "2025-12-27T09:36:58", "event": "train_log", "step": 3086, "epoch": 1.3021097046413503, "progress_pct": 21.7, "epoch_pct": 21.7, "eta": "55:48:29", "max_grad_norm": 0.8, "loss": 0.6247856020927429, "grad_norm": 1.1560224294662476, "learning_rate": 9.463342874015917e-05} +{"ts": "2025-12-27T09:37:19", "event": "train_log", "step": 3088, "epoch": 1.3029535864978903, "progress_pct": 21.72, "epoch_pct": 21.72, "eta": "55:46:58", "max_grad_norm": 0.8, "loss": 0.6889358758926392, "grad_norm": 1.135939359664917, "learning_rate": 9.462259987471329e-05} +{"ts": "2025-12-27T09:37:37", "event": "train_log", "step": 3090, "epoch": 1.3037974683544304, "progress_pct": 21.73, "epoch_pct": 21.73, "eta": "55:45:16", "max_grad_norm": 0.8, "loss": 0.7097522020339966, "grad_norm": 1.3935760259628296, "learning_rate": 9.461176071569063e-05} +{"ts": "2025-12-27T09:37:56", "event": "train_log", "step": 3092, "epoch": 1.3046413502109704, "progress_pct": 21.74, "epoch_pct": 21.74, "eta": "55:43:40", "max_grad_norm": 0.8, "loss": 0.7044580578804016, "grad_norm": 1.153518795967102, "learning_rate": 9.460091126559155e-05} +{"ts": "2025-12-27T09:38:17", "event": "train_log", "step": 3094, "epoch": 1.3054852320675105, "progress_pct": 21.76, "epoch_pct": 21.76, "eta": "55:42:10", "max_grad_norm": 0.8, "loss": 0.6119300723075867, "grad_norm": 1.2112717628479004, "learning_rate": 9.45900515269188e-05} +{"ts": "2025-12-27T09:38:36", "event": "train_log", "step": 3096, "epoch": 1.3063291139240507, "progress_pct": 21.77, "epoch_pct": 21.77, "eta": "55:40:31", "max_grad_norm": 0.8, "loss": 0.7150222063064575, "grad_norm": 1.295591115951538, "learning_rate": 9.457918150217754e-05} +{"ts": "2025-12-27T09:38:56", "event": "train_log", "step": 3098, "epoch": 1.3071729957805907, "progress_pct": 21.79, "epoch_pct": 21.79, "eta": "55:38:59", "max_grad_norm": 0.8, "loss": 0.6043334007263184, "grad_norm": 1.1175775527954102, "learning_rate": 9.456830119387527e-05} +{"ts": "2025-12-27T09:39:16", "event": "train_log", "step": 3100, "epoch": 1.3080168776371308, "progress_pct": 21.8, "epoch_pct": 21.8, "eta": "55:37:25", "max_grad_norm": 0.8, "loss": 0.6354425549507141, "grad_norm": 1.4022588729858398, "learning_rate": 9.455741060452186e-05} +{"ts": "2025-12-27T09:53:39", "event": "train_log", "step": 3100, "epoch": 1.3080168776371308, "progress_pct": 21.8, "epoch_pct": 21.8, "eta": "56:28:59", "max_grad_norm": 0.8, "eval_loss": 0.7225774526596069, "eval_runtime": 862.4006, "eval_samples_per_second": 2.443, "eval_steps_per_second": 2.443} +{"ts": "2025-12-27T09:53:59", "event": "train_log", "step": 3102, "epoch": 1.3088607594936708, "progress_pct": 21.81, "epoch_pct": 21.81, "eta": "56:27:23", "max_grad_norm": 0.8, "loss": 0.7281571626663208, "grad_norm": 1.1657692193984985, "learning_rate": 9.454650973662957e-05} +{"ts": "2025-12-27T09:54:17", "event": "train_log", "step": 3104, "epoch": 1.309704641350211, "progress_pct": 21.83, "epoch_pct": 21.83, "eta": "56:25:40", "max_grad_norm": 0.8, "loss": 0.8038214445114136, "grad_norm": 1.6169127225875854, "learning_rate": 9.453559859271301e-05} +{"ts": "2025-12-27T09:54:37", "event": "train_log", "step": 3106, "epoch": 1.310548523206751, "progress_pct": 21.84, "epoch_pct": 21.84, "eta": "56:24:03", "max_grad_norm": 0.8, "loss": 0.6488606333732605, "grad_norm": 1.1256520748138428, "learning_rate": 9.452467717528918e-05} +{"ts": "2025-12-27T09:54:56", "event": "train_log", "step": 3108, "epoch": 1.311392405063291, "progress_pct": 21.86, "epoch_pct": 21.86, "eta": "56:22:27", "max_grad_norm": 0.8, "loss": 0.6897066235542297, "grad_norm": 1.1224530935287476, "learning_rate": 9.451374548687745e-05} +{"ts": "2025-12-27T09:55:16", "event": "train_log", "step": 3110, "epoch": 1.3122362869198312, "progress_pct": 21.87, "epoch_pct": 21.87, "eta": "56:20:51", "max_grad_norm": 0.8, "loss": 0.6332913041114807, "grad_norm": 1.1123055219650269, "learning_rate": 9.450280352999952e-05} +{"ts": "2025-12-27T09:55:34", "event": "train_log", "step": 3112, "epoch": 1.3130801687763713, "progress_pct": 21.88, "epoch_pct": 21.88, "eta": "56:19:09", "max_grad_norm": 0.8, "loss": 0.7426630854606628, "grad_norm": 1.1688940525054932, "learning_rate": 9.449185130717952e-05} +{"ts": "2025-12-27T09:55:52", "event": "train_log", "step": 3114, "epoch": 1.3139240506329113, "progress_pct": 21.9, "epoch_pct": 21.9, "eta": "56:17:26", "max_grad_norm": 0.8, "loss": 0.7156099677085876, "grad_norm": 1.1898044347763062, "learning_rate": 9.44808888209439e-05} +{"ts": "2025-12-27T09:56:12", "event": "train_log", "step": 3116, "epoch": 1.3147679324894515, "progress_pct": 21.91, "epoch_pct": 21.91, "eta": "56:15:50", "max_grad_norm": 0.8, "loss": 0.7150979042053223, "grad_norm": 1.3030686378479004, "learning_rate": 9.44699160738215e-05} +{"ts": "2025-12-27T09:56:32", "event": "train_log", "step": 3118, "epoch": 1.3156118143459916, "progress_pct": 21.93, "epoch_pct": 21.93, "eta": "56:14:16", "max_grad_norm": 0.8, "loss": 0.6687285900115967, "grad_norm": 1.1539074182510376, "learning_rate": 9.445893306834352e-05} +{"ts": "2025-12-27T09:56:50", "event": "train_log", "step": 3120, "epoch": 1.3164556962025316, "progress_pct": 21.94, "epoch_pct": 21.94, "eta": "56:12:33", "max_grad_norm": 0.8, "loss": 0.7340983152389526, "grad_norm": 1.311808466911316, "learning_rate": 9.444793980704355e-05} +{"ts": "2025-12-27T09:57:09", "event": "train_log", "step": 3122, "epoch": 1.3172995780590717, "progress_pct": 21.95, "epoch_pct": 21.95, "eta": "56:10:52", "max_grad_norm": 0.8, "loss": 0.6620677709579468, "grad_norm": 1.3325430154800415, "learning_rate": 9.44369362924575e-05} +{"ts": "2025-12-27T09:57:26", "event": "train_log", "step": 3124, "epoch": 1.3181434599156119, "progress_pct": 21.97, "epoch_pct": 21.97, "eta": "56:09:10", "max_grad_norm": 0.8, "loss": 0.6169955134391785, "grad_norm": 1.201518177986145, "learning_rate": 9.442592252712365e-05} +{"ts": "2025-12-27T09:57:47", "event": "train_log", "step": 3126, "epoch": 1.3189873417721518, "progress_pct": 21.98, "epoch_pct": 21.98, "eta": "56:07:36", "max_grad_norm": 0.8, "loss": 0.6696792840957642, "grad_norm": 1.2124013900756836, "learning_rate": 9.441489851358272e-05} +{"ts": "2025-12-27T09:58:05", "event": "train_log", "step": 3128, "epoch": 1.319831223628692, "progress_pct": 22.0, "epoch_pct": 22.0, "eta": "56:05:57", "max_grad_norm": 0.8, "loss": 0.7303428649902344, "grad_norm": 1.2186850309371948, "learning_rate": 9.440386425437768e-05} +{"ts": "2025-12-27T09:58:24", "event": "train_log", "step": 3130, "epoch": 1.3206751054852321, "progress_pct": 22.01, "epoch_pct": 22.01, "eta": "56:04:17", "max_grad_norm": 0.8, "loss": 0.7093026638031006, "grad_norm": 1.3780523538589478, "learning_rate": 9.439281975205396e-05} +{"ts": "2025-12-27T09:58:43", "event": "train_log", "step": 3132, "epoch": 1.321518987341772, "progress_pct": 22.03, "epoch_pct": 22.03, "eta": "56:02:37", "max_grad_norm": 0.8, "loss": 0.6821767687797546, "grad_norm": 1.233353614807129, "learning_rate": 9.438176500915932e-05} +{"ts": "2025-12-27T09:59:01", "event": "train_log", "step": 3134, "epoch": 1.3223628691983123, "progress_pct": 22.04, "epoch_pct": 22.04, "eta": "56:00:56", "max_grad_norm": 0.8, "loss": 0.700680136680603, "grad_norm": 1.2425329685211182, "learning_rate": 9.437070002824385e-05} +{"ts": "2025-12-27T09:59:21", "event": "train_log", "step": 3136, "epoch": 1.3232067510548524, "progress_pct": 22.05, "epoch_pct": 22.05, "eta": "55:59:22", "max_grad_norm": 0.8, "loss": 0.6173145771026611, "grad_norm": 1.1600432395935059, "learning_rate": 9.435962481186003e-05} +{"ts": "2025-12-27T09:59:40", "event": "train_log", "step": 3138, "epoch": 1.3240506329113924, "progress_pct": 22.07, "epoch_pct": 22.07, "eta": "55:57:45", "max_grad_norm": 0.8, "loss": 0.6597106456756592, "grad_norm": 1.279336929321289, "learning_rate": 9.434853936256272e-05} +{"ts": "2025-12-27T10:00:01", "event": "train_log", "step": 3140, "epoch": 1.3248945147679325, "progress_pct": 22.08, "epoch_pct": 22.08, "eta": "55:56:16", "max_grad_norm": 0.8, "loss": 0.6655287742614746, "grad_norm": 1.1787258386611938, "learning_rate": 9.433744368290909e-05} +{"ts": "2025-12-27T10:00:21", "event": "train_log", "step": 3142, "epoch": 1.3257383966244727, "progress_pct": 22.1, "epoch_pct": 22.1, "eta": "55:54:40", "max_grad_norm": 0.8, "loss": 0.6312944889068604, "grad_norm": 1.3658509254455566, "learning_rate": 9.432633777545874e-05} +{"ts": "2025-12-27T10:00:41", "event": "train_log", "step": 3144, "epoch": 1.3265822784810126, "progress_pct": 22.11, "epoch_pct": 22.11, "eta": "55:53:09", "max_grad_norm": 0.8, "loss": 0.6696156859397888, "grad_norm": 1.1220000982284546, "learning_rate": 9.431522164277356e-05} +{"ts": "2025-12-27T10:01:00", "event": "train_log", "step": 3146, "epoch": 1.3274261603375528, "progress_pct": 22.12, "epoch_pct": 22.12, "eta": "55:51:30", "max_grad_norm": 0.8, "loss": 0.6586571335792542, "grad_norm": 1.224761724472046, "learning_rate": 9.430409528741783e-05} +{"ts": "2025-12-27T10:01:19", "event": "train_log", "step": 3148, "epoch": 1.328270042194093, "progress_pct": 22.14, "epoch_pct": 22.14, "eta": "55:49:54", "max_grad_norm": 0.8, "loss": 0.64905846118927, "grad_norm": 1.227510929107666, "learning_rate": 9.429295871195821e-05} +{"ts": "2025-12-27T10:01:39", "event": "train_log", "step": 3150, "epoch": 1.3291139240506329, "progress_pct": 22.15, "epoch_pct": 22.15, "eta": "55:48:19", "max_grad_norm": 0.8, "loss": 0.6407933831214905, "grad_norm": 1.1359103918075562, "learning_rate": 9.428181191896366e-05} +{"ts": "2025-12-27T10:01:59", "event": "train_log", "step": 3152, "epoch": 1.329957805907173, "progress_pct": 22.17, "epoch_pct": 22.17, "eta": "55:46:46", "max_grad_norm": 0.8, "loss": 0.7004884481430054, "grad_norm": 1.2729473114013672, "learning_rate": 9.427065491100556e-05} +{"ts": "2025-12-27T10:02:20", "event": "train_log", "step": 3154, "epoch": 1.3308016877637132, "progress_pct": 22.18, "epoch_pct": 22.18, "eta": "55:45:15", "max_grad_norm": 0.8, "loss": 0.6835907101631165, "grad_norm": 1.1182841062545776, "learning_rate": 9.42594876906576e-05} +{"ts": "2025-12-27T10:02:39", "event": "train_log", "step": 3156, "epoch": 1.3316455696202532, "progress_pct": 22.19, "epoch_pct": 22.19, "eta": "55:43:38", "max_grad_norm": 0.8, "loss": 0.7476315498352051, "grad_norm": 1.2309781312942505, "learning_rate": 9.424831026049585e-05} +{"ts": "2025-12-27T10:02:58", "event": "train_log", "step": 3158, "epoch": 1.3324894514767933, "progress_pct": 22.21, "epoch_pct": 22.21, "eta": "55:42:03", "max_grad_norm": 0.8, "loss": 0.6811426281929016, "grad_norm": 1.0857728719711304, "learning_rate": 9.423712262309873e-05} +{"ts": "2025-12-27T10:03:18", "event": "train_log", "step": 3160, "epoch": 1.3333333333333333, "progress_pct": 22.22, "epoch_pct": 22.22, "eta": "55:40:30", "max_grad_norm": 0.8, "loss": 0.6403942108154297, "grad_norm": 1.299680233001709, "learning_rate": 9.4225924781047e-05} +{"ts": "2025-12-27T10:03:39", "event": "train_log", "step": 3162, "epoch": 1.3341772151898734, "progress_pct": 22.24, "epoch_pct": 22.24, "eta": "55:38:59", "max_grad_norm": 0.8, "loss": 0.6758930683135986, "grad_norm": 1.226472020149231, "learning_rate": 9.421471673692382e-05} +{"ts": "2025-12-27T10:03:58", "event": "train_log", "step": 3164, "epoch": 1.3350210970464136, "progress_pct": 22.25, "epoch_pct": 22.25, "eta": "55:37:25", "max_grad_norm": 0.8, "loss": 0.7119444608688354, "grad_norm": 1.1403205394744873, "learning_rate": 9.420349849331463e-05} +{"ts": "2025-12-27T10:04:17", "event": "train_log", "step": 3166, "epoch": 1.3358649789029535, "progress_pct": 22.26, "epoch_pct": 22.26, "eta": "55:35:46", "max_grad_norm": 0.8, "loss": 0.7411463260650635, "grad_norm": 1.2888442277908325, "learning_rate": 9.419227005280729e-05} +{"ts": "2025-12-27T10:04:37", "event": "train_log", "step": 3168, "epoch": 1.3367088607594937, "progress_pct": 22.28, "epoch_pct": 22.28, "eta": "55:34:13", "max_grad_norm": 0.8, "loss": 0.5992606282234192, "grad_norm": 1.1929190158843994, "learning_rate": 9.418103141799197e-05} +{"ts": "2025-12-27T10:04:57", "event": "train_log", "step": 3170, "epoch": 1.3375527426160336, "progress_pct": 22.29, "epoch_pct": 22.29, "eta": "55:32:42", "max_grad_norm": 0.8, "loss": 0.6728890538215637, "grad_norm": 1.2574355602264404, "learning_rate": 9.416978259146122e-05} +{"ts": "2025-12-27T10:05:19", "event": "train_log", "step": 3172, "epoch": 1.3383966244725738, "progress_pct": 22.31, "epoch_pct": 22.31, "eta": "55:31:15", "max_grad_norm": 0.8, "loss": 0.6294883489608765, "grad_norm": 0.9653727412223816, "learning_rate": 9.415852357580992e-05} +{"ts": "2025-12-27T10:05:38", "event": "train_log", "step": 3174, "epoch": 1.339240506329114, "progress_pct": 22.32, "epoch_pct": 22.32, "eta": "55:29:39", "max_grad_norm": 0.8, "loss": 0.6816665530204773, "grad_norm": 1.2107670307159424, "learning_rate": 9.414725437363532e-05} +{"ts": "2025-12-27T10:05:59", "event": "train_log", "step": 3176, "epoch": 1.340084388185654, "progress_pct": 22.33, "epoch_pct": 22.33, "eta": "55:28:09", "max_grad_norm": 0.8, "loss": 0.6186381578445435, "grad_norm": 1.024849534034729, "learning_rate": 9.4135974987537e-05} +{"ts": "2025-12-27T10:06:19", "event": "train_log", "step": 3178, "epoch": 1.340928270042194, "progress_pct": 22.35, "epoch_pct": 22.35, "eta": "55:26:37", "max_grad_norm": 0.8, "loss": 0.6071005463600159, "grad_norm": 1.1556614637374878, "learning_rate": 9.41246854201169e-05} +{"ts": "2025-12-27T10:06:38", "event": "train_log", "step": 3180, "epoch": 1.3417721518987342, "progress_pct": 22.36, "epoch_pct": 22.36, "eta": "55:25:04", "max_grad_norm": 0.8, "loss": 0.7871434092521667, "grad_norm": 1.2382808923721313, "learning_rate": 9.41133856739793e-05} +{"ts": "2025-12-27T10:07:00", "event": "train_log", "step": 3182, "epoch": 1.3426160337552742, "progress_pct": 22.38, "epoch_pct": 22.38, "eta": "55:23:38", "max_grad_norm": 0.8, "loss": 0.6578201651573181, "grad_norm": 1.0499578714370728, "learning_rate": 9.410207575173082e-05} +{"ts": "2025-12-27T10:07:20", "event": "train_log", "step": 3184, "epoch": 1.3434599156118143, "progress_pct": 22.39, "epoch_pct": 22.39, "eta": "55:22:06", "max_grad_norm": 0.8, "loss": 0.6271620392799377, "grad_norm": 1.2048250436782837, "learning_rate": 9.409075565598049e-05} +{"ts": "2025-12-27T10:07:40", "event": "train_log", "step": 3186, "epoch": 1.3443037974683545, "progress_pct": 22.41, "epoch_pct": 22.41, "eta": "55:20:33", "max_grad_norm": 0.8, "loss": 0.5773864388465881, "grad_norm": 1.0287591218948364, "learning_rate": 9.407942538933958e-05} +{"ts": "2025-12-27T10:07:59", "event": "train_log", "step": 3188, "epoch": 1.3451476793248944, "progress_pct": 22.42, "epoch_pct": 22.42, "eta": "55:18:59", "max_grad_norm": 0.8, "loss": 0.6745175719261169, "grad_norm": 1.1125097274780273, "learning_rate": 9.406808495442181e-05} +{"ts": "2025-12-27T10:08:20", "event": "train_log", "step": 3190, "epoch": 1.3459915611814346, "progress_pct": 22.43, "epoch_pct": 22.43, "eta": "55:17:31", "max_grad_norm": 0.8, "loss": 0.6001214385032654, "grad_norm": 1.036125898361206, "learning_rate": 9.405673435384319e-05} +{"ts": "2025-12-27T10:08:40", "event": "train_log", "step": 3192, "epoch": 1.3468354430379748, "progress_pct": 22.45, "epoch_pct": 22.45, "eta": "55:15:58", "max_grad_norm": 0.8, "loss": 0.6703945994377136, "grad_norm": 1.2771985530853271, "learning_rate": 9.404537359022207e-05} +{"ts": "2025-12-27T10:09:02", "event": "train_log", "step": 3194, "epoch": 1.3476793248945147, "progress_pct": 22.46, "epoch_pct": 22.46, "eta": "55:14:34", "max_grad_norm": 0.8, "loss": 0.6159096360206604, "grad_norm": 1.0891097784042358, "learning_rate": 9.403400266617918e-05} +{"ts": "2025-12-27T10:09:22", "event": "train_log", "step": 3196, "epoch": 1.3485232067510549, "progress_pct": 22.48, "epoch_pct": 22.48, "eta": "55:13:03", "max_grad_norm": 0.8, "loss": 0.6439315676689148, "grad_norm": 1.1926233768463135, "learning_rate": 9.402262158433755e-05} +{"ts": "2025-12-27T10:09:43", "event": "train_log", "step": 3198, "epoch": 1.349367088607595, "progress_pct": 22.49, "epoch_pct": 22.49, "eta": "55:11:33", "max_grad_norm": 0.8, "loss": 0.7125352025032043, "grad_norm": 1.272557020187378, "learning_rate": 9.40112303473226e-05} +{"ts": "2025-12-27T10:10:03", "event": "train_log", "step": 3200, "epoch": 1.350210970464135, "progress_pct": 22.5, "epoch_pct": 22.5, "eta": "55:10:04", "max_grad_norm": 0.8, "loss": 0.594719648361206, "grad_norm": 1.052037239074707, "learning_rate": 9.399982895776207e-05} +{"ts": "2025-12-27T10:24:10", "event": "train_log", "step": 3200, "epoch": 1.350210970464135, "progress_pct": 22.5, "epoch_pct": 22.5, "eta": "55:58:38", "max_grad_norm": 0.8, "eval_loss": 0.7200453281402588, "eval_runtime": 846.2953, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49} +{"ts": "2025-12-27T10:24:30", "event": "train_log", "step": 3202, "epoch": 1.3510548523206751, "progress_pct": 22.52, "epoch_pct": 22.52, "eta": "55:57:05", "max_grad_norm": 0.8, "loss": 0.6390520334243774, "grad_norm": 1.204728126525879, "learning_rate": 9.398841741828601e-05} +{"ts": "2025-12-27T10:24:50", "event": "train_log", "step": 3204, "epoch": 1.3518987341772153, "progress_pct": 22.53, "epoch_pct": 22.53, "eta": "55:55:33", "max_grad_norm": 0.8, "loss": 0.6010531187057495, "grad_norm": 1.0873899459838867, "learning_rate": 9.397699573152689e-05} +{"ts": "2025-12-27T10:25:10", "event": "train_log", "step": 3206, "epoch": 1.3527426160337552, "progress_pct": 22.55, "epoch_pct": 22.55, "eta": "55:53:57", "max_grad_norm": 0.8, "loss": 0.724280834197998, "grad_norm": 1.3124359846115112, "learning_rate": 9.396556390011944e-05} +{"ts": "2025-12-27T10:25:30", "event": "train_log", "step": 3208, "epoch": 1.3535864978902954, "progress_pct": 22.56, "epoch_pct": 22.56, "eta": "55:52:25", "max_grad_norm": 0.8, "loss": 0.6430405378341675, "grad_norm": 1.2179948091506958, "learning_rate": 9.395412192670075e-05} +{"ts": "2025-12-27T10:25:50", "event": "train_log", "step": 3210, "epoch": 1.3544303797468356, "progress_pct": 22.57, "epoch_pct": 22.57, "eta": "55:50:52", "max_grad_norm": 0.8, "loss": 0.7188641428947449, "grad_norm": 1.2617219686508179, "learning_rate": 9.394266981391031e-05} +{"ts": "2025-12-27T10:26:10", "event": "train_log", "step": 3212, "epoch": 1.3552742616033755, "progress_pct": 22.59, "epoch_pct": 22.59, "eta": "55:49:19", "max_grad_norm": 0.8, "loss": 0.6724364757537842, "grad_norm": 1.2151501178741455, "learning_rate": 9.393120756438988e-05} +{"ts": "2025-12-27T10:26:30", "event": "train_log", "step": 3214, "epoch": 1.3561181434599157, "progress_pct": 22.6, "epoch_pct": 22.6, "eta": "55:47:46", "max_grad_norm": 0.8, "loss": 0.6340664625167847, "grad_norm": 1.221528172492981, "learning_rate": 9.391973518078357e-05} +{"ts": "2025-12-27T10:26:50", "event": "train_log", "step": 3216, "epoch": 1.3569620253164558, "progress_pct": 22.62, "epoch_pct": 22.62, "eta": "55:46:14", "max_grad_norm": 0.8, "loss": 0.6914255023002625, "grad_norm": 1.3180092573165894, "learning_rate": 9.390825266573786e-05} +{"ts": "2025-12-27T10:27:12", "event": "train_log", "step": 3218, "epoch": 1.3578059071729958, "progress_pct": 22.63, "epoch_pct": 22.63, "eta": "55:44:47", "max_grad_norm": 0.8, "loss": 0.6137136220932007, "grad_norm": 1.103994369506836, "learning_rate": 9.38967600219015e-05} +{"ts": "2025-12-27T10:27:31", "event": "train_log", "step": 3220, "epoch": 1.358649789029536, "progress_pct": 22.64, "epoch_pct": 22.64, "eta": "55:43:10", "max_grad_norm": 0.8, "loss": 0.7173700332641602, "grad_norm": 1.33389413356781, "learning_rate": 9.38852572519257e-05} +{"ts": "2025-12-27T10:27:51", "event": "train_log", "step": 3222, "epoch": 1.3594936708860759, "progress_pct": 22.66, "epoch_pct": 22.66, "eta": "55:41:37", "max_grad_norm": 0.8, "loss": 0.5942243933677673, "grad_norm": 1.1074159145355225, "learning_rate": 9.387374435846386e-05} +{"ts": "2025-12-27T10:28:10", "event": "train_log", "step": 3224, "epoch": 1.360337552742616, "progress_pct": 22.67, "epoch_pct": 22.67, "eta": "55:40:02", "max_grad_norm": 0.8, "loss": 0.6362866163253784, "grad_norm": 1.1157063245773315, "learning_rate": 9.386222134417182e-05} +{"ts": "2025-12-27T10:28:30", "event": "train_log", "step": 3226, "epoch": 1.3611814345991562, "progress_pct": 22.69, "epoch_pct": 22.69, "eta": "55:38:30", "max_grad_norm": 0.8, "loss": 0.6784523129463196, "grad_norm": 1.1717792749404907, "learning_rate": 9.38506882117077e-05} +{"ts": "2025-12-27T10:28:50", "event": "train_log", "step": 3228, "epoch": 1.3620253164556961, "progress_pct": 22.7, "epoch_pct": 22.7, "eta": "55:36:58", "max_grad_norm": 0.8, "loss": 0.6647377014160156, "grad_norm": 1.0946043729782104, "learning_rate": 9.383914496373197e-05} +{"ts": "2025-12-27T10:29:12", "event": "train_log", "step": 3230, "epoch": 1.3628691983122363, "progress_pct": 22.71, "epoch_pct": 22.71, "eta": "55:35:31", "max_grad_norm": 0.8, "loss": 0.6302075982093811, "grad_norm": 1.1519699096679688, "learning_rate": 9.382759160290746e-05} +{"ts": "2025-12-27T10:29:34", "event": "train_log", "step": 3232, "epoch": 1.3637130801687762, "progress_pct": 22.73, "epoch_pct": 22.73, "eta": "55:34:05", "max_grad_norm": 0.8, "loss": 0.5979090332984924, "grad_norm": 0.9928684830665588, "learning_rate": 9.381602813189929e-05} +{"ts": "2025-12-27T10:29:52", "event": "train_log", "step": 3234, "epoch": 1.3645569620253164, "progress_pct": 22.74, "epoch_pct": 22.74, "eta": "55:32:28", "max_grad_norm": 0.8, "loss": 0.6949353218078613, "grad_norm": 1.2488124370574951, "learning_rate": 9.380445455337492e-05} +{"ts": "2025-12-27T10:30:10", "event": "train_log", "step": 3236, "epoch": 1.3654008438818566, "progress_pct": 22.76, "epoch_pct": 22.76, "eta": "55:30:49", "max_grad_norm": 0.8, "loss": 0.7225558161735535, "grad_norm": 1.3884797096252441, "learning_rate": 9.379287087000416e-05} +{"ts": "2025-12-27T10:30:28", "event": "train_log", "step": 3238, "epoch": 1.3662447257383965, "progress_pct": 22.77, "epoch_pct": 22.77, "eta": "55:29:11", "max_grad_norm": 0.8, "loss": 0.6993390917778015, "grad_norm": 1.2981176376342773, "learning_rate": 9.378127708445917e-05} +{"ts": "2025-12-27T10:30:48", "event": "train_log", "step": 3240, "epoch": 1.3670886075949367, "progress_pct": 22.78, "epoch_pct": 22.78, "eta": "55:27:38", "max_grad_norm": 0.8, "loss": 0.6983805894851685, "grad_norm": 0.9884640574455261, "learning_rate": 9.376967319941438e-05} +{"ts": "2025-12-27T10:31:08", "event": "train_log", "step": 3242, "epoch": 1.3679324894514768, "progress_pct": 22.8, "epoch_pct": 22.8, "eta": "55:26:05", "max_grad_norm": 0.8, "loss": 0.7062534689903259, "grad_norm": 1.2051894664764404, "learning_rate": 9.375805921754659e-05} +{"ts": "2025-12-27T10:31:28", "event": "train_log", "step": 3244, "epoch": 1.3687763713080168, "progress_pct": 22.81, "epoch_pct": 22.81, "eta": "55:24:35", "max_grad_norm": 0.8, "loss": 0.6405107378959656, "grad_norm": 1.1943434476852417, "learning_rate": 9.374643514153494e-05} +{"ts": "2025-12-27T10:31:48", "event": "train_log", "step": 3246, "epoch": 1.369620253164557, "progress_pct": 22.83, "epoch_pct": 22.83, "eta": "55:23:01", "max_grad_norm": 0.8, "loss": 0.6844781637191772, "grad_norm": 1.249214768409729, "learning_rate": 9.373480097406086e-05} +{"ts": "2025-12-27T10:32:08", "event": "train_log", "step": 3248, "epoch": 1.370464135021097, "progress_pct": 22.84, "epoch_pct": 22.84, "eta": "55:21:30", "max_grad_norm": 0.8, "loss": 0.6048306226730347, "grad_norm": 1.1847131252288818, "learning_rate": 9.372315671780813e-05} +{"ts": "2025-12-27T10:32:27", "event": "train_log", "step": 3250, "epoch": 1.371308016877637, "progress_pct": 22.86, "epoch_pct": 22.86, "eta": "55:19:56", "max_grad_norm": 0.8, "loss": 0.6772685050964355, "grad_norm": 1.125545859336853, "learning_rate": 9.37115023754629e-05} +{"ts": "2025-12-27T10:32:44", "event": "train_log", "step": 3252, "epoch": 1.3721518987341772, "progress_pct": 22.87, "epoch_pct": 22.87, "eta": "55:18:14", "max_grad_norm": 0.8, "loss": 0.7536272406578064, "grad_norm": 1.466615915298462, "learning_rate": 9.369983794971354e-05} +{"ts": "2025-12-27T10:33:03", "event": "train_log", "step": 3254, "epoch": 1.3729957805907174, "progress_pct": 22.88, "epoch_pct": 22.88, "eta": "55:16:41", "max_grad_norm": 0.8, "loss": 0.6640655398368835, "grad_norm": 1.066699504852295, "learning_rate": 9.368816344325084e-05} +{"ts": "2025-12-27T10:33:23", "event": "train_log", "step": 3256, "epoch": 1.3738396624472573, "progress_pct": 22.9, "epoch_pct": 22.9, "eta": "55:15:10", "max_grad_norm": 0.8, "loss": 0.7029458284378052, "grad_norm": 1.4793988466262817, "learning_rate": 9.367647885876787e-05} +{"ts": "2025-12-27T10:33:42", "event": "train_log", "step": 3258, "epoch": 1.3746835443037975, "progress_pct": 22.91, "epoch_pct": 22.91, "eta": "55:13:34", "max_grad_norm": 0.8, "loss": 0.7231863737106323, "grad_norm": 1.258540153503418, "learning_rate": 9.366478419896006e-05} +{"ts": "2025-12-27T10:34:02", "event": "train_log", "step": 3260, "epoch": 1.3755274261603376, "progress_pct": 22.93, "epoch_pct": 22.93, "eta": "55:12:04", "max_grad_norm": 0.8, "loss": 0.6679144501686096, "grad_norm": 1.176106333732605, "learning_rate": 9.365307946652512e-05} +{"ts": "2025-12-27T10:34:22", "event": "train_log", "step": 3262, "epoch": 1.3763713080168776, "progress_pct": 22.94, "epoch_pct": 22.94, "eta": "55:10:31", "max_grad_norm": 0.8, "loss": 0.6282188296318054, "grad_norm": 1.3301753997802734, "learning_rate": 9.364136466416316e-05} +{"ts": "2025-12-27T10:34:40", "event": "train_log", "step": 3264, "epoch": 1.3772151898734177, "progress_pct": 22.95, "epoch_pct": 22.95, "eta": "55:08:54", "max_grad_norm": 0.8, "loss": 0.6870840191841125, "grad_norm": 1.3616732358932495, "learning_rate": 9.362963979457648e-05} +{"ts": "2025-12-27T10:35:00", "event": "train_log", "step": 3266, "epoch": 1.378059071729958, "progress_pct": 22.97, "epoch_pct": 22.97, "eta": "55:07:24", "max_grad_norm": 0.8, "loss": 0.6823731660842896, "grad_norm": 1.1982418298721313, "learning_rate": 9.361790486046985e-05} +{"ts": "2025-12-27T10:35:19", "event": "train_log", "step": 3268, "epoch": 1.3789029535864978, "progress_pct": 22.98, "epoch_pct": 22.98, "eta": "55:05:52", "max_grad_norm": 0.8, "loss": 0.6582897305488586, "grad_norm": 1.1869033575057983, "learning_rate": 9.360615986455024e-05} +{"ts": "2025-12-27T10:35:39", "event": "train_log", "step": 3270, "epoch": 1.379746835443038, "progress_pct": 23.0, "epoch_pct": 23.0, "eta": "55:04:22", "max_grad_norm": 0.8, "loss": 0.716654360294342, "grad_norm": 1.1192975044250488, "learning_rate": 9.359440480952703e-05} +{"ts": "2025-12-27T10:35:59", "event": "train_log", "step": 3272, "epoch": 1.3805907172995782, "progress_pct": 23.01, "epoch_pct": 23.01, "eta": "55:02:51", "max_grad_norm": 0.8, "loss": 0.6880061626434326, "grad_norm": 1.2210016250610352, "learning_rate": 9.358263969811189e-05} +{"ts": "2025-12-27T10:36:20", "event": "train_log", "step": 3274, "epoch": 1.381434599156118, "progress_pct": 23.02, "epoch_pct": 23.02, "eta": "55:01:22", "max_grad_norm": 0.8, "loss": 0.666864812374115, "grad_norm": 1.0358284711837769, "learning_rate": 9.357086453301878e-05} +{"ts": "2025-12-27T10:36:39", "event": "train_log", "step": 3276, "epoch": 1.3822784810126583, "progress_pct": 23.04, "epoch_pct": 23.04, "eta": "54:59:48", "max_grad_norm": 0.8, "loss": 0.6872087121009827, "grad_norm": 1.2790803909301758, "learning_rate": 9.355907931696401e-05} +{"ts": "2025-12-27T10:36:57", "event": "train_log", "step": 3278, "epoch": 1.3831223628691984, "progress_pct": 23.05, "epoch_pct": 23.05, "eta": "54:58:13", "max_grad_norm": 0.8, "loss": 0.5929665565490723, "grad_norm": 1.182991623878479, "learning_rate": 9.354728405266623e-05} +{"ts": "2025-12-27T10:37:18", "event": "train_log", "step": 3280, "epoch": 1.3839662447257384, "progress_pct": 23.07, "epoch_pct": 23.07, "eta": "54:56:44", "max_grad_norm": 0.8, "loss": 0.5928181409835815, "grad_norm": 1.1071184873580933, "learning_rate": 9.353547874284634e-05} +{"ts": "2025-12-27T10:37:36", "event": "train_log", "step": 3282, "epoch": 1.3848101265822785, "progress_pct": 23.08, "epoch_pct": 23.08, "eta": "54:55:09", "max_grad_norm": 0.8, "loss": 0.6783652901649475, "grad_norm": 1.3139623403549194, "learning_rate": 9.352366339022763e-05} +{"ts": "2025-12-27T10:37:54", "event": "train_log", "step": 3284, "epoch": 1.3856540084388187, "progress_pct": 23.09, "epoch_pct": 23.09, "eta": "54:53:32", "max_grad_norm": 0.8, "loss": 0.7652941346168518, "grad_norm": 1.2534632682800293, "learning_rate": 9.351183799753567e-05} +{"ts": "2025-12-27T10:38:12", "event": "train_log", "step": 3286, "epoch": 1.3864978902953586, "progress_pct": 23.11, "epoch_pct": 23.11, "eta": "54:51:56", "max_grad_norm": 0.8, "loss": 0.7430433630943298, "grad_norm": 1.4487930536270142, "learning_rate": 9.350000256749833e-05} +{"ts": "2025-12-27T10:38:33", "event": "train_log", "step": 3288, "epoch": 1.3873417721518988, "progress_pct": 23.12, "epoch_pct": 23.12, "eta": "54:50:28", "max_grad_norm": 0.8, "loss": 0.5854598879814148, "grad_norm": 1.0786021947860718, "learning_rate": 9.348815710284584e-05} +{"ts": "2025-12-27T10:38:53", "event": "train_log", "step": 3290, "epoch": 1.3881856540084387, "progress_pct": 23.14, "epoch_pct": 23.14, "eta": "54:48:58", "max_grad_norm": 0.8, "loss": 0.6365222334861755, "grad_norm": 1.0544480085372925, "learning_rate": 9.347630160631071e-05} +{"ts": "2025-12-27T10:39:12", "event": "train_log", "step": 3292, "epoch": 1.389029535864979, "progress_pct": 23.15, "epoch_pct": 23.15, "eta": "54:47:27", "max_grad_norm": 0.8, "loss": 0.6485803127288818, "grad_norm": 0.9989988207817078, "learning_rate": 9.346443608062778e-05} +{"ts": "2025-12-27T10:39:31", "event": "train_log", "step": 3294, "epoch": 1.389873417721519, "progress_pct": 23.16, "epoch_pct": 23.16, "eta": "54:45:54", "max_grad_norm": 0.8, "loss": 0.6417753100395203, "grad_norm": 1.100951910018921, "learning_rate": 9.345256052853419e-05} +{"ts": "2025-12-27T10:39:50", "event": "train_log", "step": 3296, "epoch": 1.390717299578059, "progress_pct": 23.18, "epoch_pct": 23.18, "eta": "54:44:22", "max_grad_norm": 0.8, "loss": 0.6333693861961365, "grad_norm": 1.1398471593856812, "learning_rate": 9.344067495276942e-05} +{"ts": "2025-12-27T10:40:09", "event": "train_log", "step": 3298, "epoch": 1.3915611814345992, "progress_pct": 23.19, "epoch_pct": 23.19, "eta": "54:42:47", "max_grad_norm": 0.8, "loss": 0.677288293838501, "grad_norm": 1.1745941638946533, "learning_rate": 9.342877935607521e-05} +{"ts": "2025-12-27T10:40:26", "event": "train_log", "step": 3300, "epoch": 1.3924050632911391, "progress_pct": 23.21, "epoch_pct": 23.21, "eta": "54:41:10", "max_grad_norm": 0.8, "loss": 0.7408396005630493, "grad_norm": 1.2651115655899048, "learning_rate": 9.34168737411957e-05} +{"ts": "2025-12-27T10:54:40", "event": "train_log", "step": 3300, "epoch": 1.3924050632911391, "progress_pct": 23.21, "epoch_pct": 23.21, "eta": "55:28:14", "max_grad_norm": 0.8, "eval_loss": 0.7173135876655579, "eval_runtime": 853.5344, "eval_samples_per_second": 2.469, "eval_steps_per_second": 2.469} +{"ts": "2025-12-27T10:55:00", "event": "train_log", "step": 3302, "epoch": 1.3932489451476793, "progress_pct": 23.22, "epoch_pct": 23.22, "eta": "55:26:42", "max_grad_norm": 0.8, "loss": 0.6810371279716492, "grad_norm": 1.0747730731964111, "learning_rate": 9.340495811087723e-05} +{"ts": "2025-12-27T10:55:19", "event": "train_log", "step": 3304, "epoch": 1.3940928270042194, "progress_pct": 23.23, "epoch_pct": 23.23, "eta": "55:25:09", "max_grad_norm": 0.8, "loss": 0.6693953275680542, "grad_norm": 1.2857651710510254, "learning_rate": 9.339303246786854e-05} +{"ts": "2025-12-27T10:55:38", "event": "train_log", "step": 3306, "epoch": 1.3949367088607594, "progress_pct": 23.25, "epoch_pct": 23.25, "eta": "55:23:33", "max_grad_norm": 0.8, "loss": 0.7019274234771729, "grad_norm": 1.4544212818145752, "learning_rate": 9.338109681492063e-05} +{"ts": "2025-12-27T10:55:58", "event": "train_log", "step": 3308, "epoch": 1.3957805907172995, "progress_pct": 23.26, "epoch_pct": 23.26, "eta": "55:22:04", "max_grad_norm": 0.8, "loss": 0.6074224710464478, "grad_norm": 1.687755823135376, "learning_rate": 9.336915115478685e-05} +{"ts": "2025-12-27T10:56:19", "event": "train_log", "step": 3310, "epoch": 1.3966244725738397, "progress_pct": 23.28, "epoch_pct": 23.28, "eta": "55:20:35", "max_grad_norm": 0.8, "loss": 0.6981383562088013, "grad_norm": 1.1645431518554688, "learning_rate": 9.33571954902228e-05} +{"ts": "2025-12-27T10:56:38", "event": "train_log", "step": 3312, "epoch": 1.3974683544303796, "progress_pct": 23.29, "epoch_pct": 23.29, "eta": "55:19:01", "max_grad_norm": 0.8, "loss": 0.7282926440238953, "grad_norm": 1.6173527240753174, "learning_rate": 9.334522982398646e-05} +{"ts": "2025-12-27T10:56:57", "event": "train_log", "step": 3314, "epoch": 1.3983122362869198, "progress_pct": 23.31, "epoch_pct": 23.31, "eta": "55:17:27", "max_grad_norm": 0.8, "loss": 0.6574883460998535, "grad_norm": 1.3132909536361694, "learning_rate": 9.333325415883804e-05} +{"ts": "2025-12-27T10:57:16", "event": "train_log", "step": 3316, "epoch": 1.39915611814346, "progress_pct": 23.32, "epoch_pct": 23.32, "eta": "55:15:53", "max_grad_norm": 0.8, "loss": 0.6559937596321106, "grad_norm": 1.1629762649536133, "learning_rate": 9.332126849754014e-05} +{"ts": "2025-12-27T10:57:35", "event": "train_log", "step": 3318, "epoch": 1.4, "progress_pct": 23.33, "epoch_pct": 23.33, "eta": "55:14:20", "max_grad_norm": 0.8, "loss": 0.683718740940094, "grad_norm": 1.1666897535324097, "learning_rate": 9.33092728428576e-05} +{"ts": "2025-12-27T10:57:55", "event": "train_log", "step": 3320, "epoch": 1.40084388185654, "progress_pct": 23.35, "epoch_pct": 23.35, "eta": "55:12:49", "max_grad_norm": 0.8, "loss": 0.6909779906272888, "grad_norm": 1.2269554138183594, "learning_rate": 9.329726719755756e-05} +{"ts": "2025-12-27T10:58:16", "event": "train_log", "step": 3322, "epoch": 1.4016877637130802, "progress_pct": 23.36, "epoch_pct": 23.36, "eta": "55:11:21", "max_grad_norm": 0.8, "loss": 0.6051948666572571, "grad_norm": 1.1010066270828247, "learning_rate": 9.328525156440952e-05} +{"ts": "2025-12-27T10:58:35", "event": "train_log", "step": 3324, "epoch": 1.4025316455696202, "progress_pct": 23.38, "epoch_pct": 23.38, "eta": "55:09:47", "max_grad_norm": 0.8, "loss": 0.6266679763793945, "grad_norm": 1.127143144607544, "learning_rate": 9.327322594618528e-05} +{"ts": "2025-12-27T10:58:55", "event": "train_log", "step": 3326, "epoch": 1.4033755274261603, "progress_pct": 23.39, "epoch_pct": 23.39, "eta": "55:08:18", "max_grad_norm": 0.8, "loss": 0.6587526202201843, "grad_norm": 1.2160708904266357, "learning_rate": 9.326119034565887e-05} +{"ts": "2025-12-27T10:59:16", "event": "train_log", "step": 3328, "epoch": 1.4042194092827005, "progress_pct": 23.4, "epoch_pct": 23.4, "eta": "55:06:51", "max_grad_norm": 0.8, "loss": 0.5916946530342102, "grad_norm": 1.0853947401046753, "learning_rate": 9.32491447656067e-05} +{"ts": "2025-12-27T10:59:36", "event": "train_log", "step": 3330, "epoch": 1.4050632911392404, "progress_pct": 23.42, "epoch_pct": 23.42, "eta": "55:05:21", "max_grad_norm": 0.8, "loss": 0.6032452583312988, "grad_norm": 1.2205027341842651, "learning_rate": 9.323708920880744e-05} +{"ts": "2025-12-27T10:59:57", "event": "train_log", "step": 3332, "epoch": 1.4059071729957806, "progress_pct": 23.43, "epoch_pct": 23.43, "eta": "55:03:52", "max_grad_norm": 0.8, "loss": 0.6649114489555359, "grad_norm": 1.1964668035507202, "learning_rate": 9.32250236780421e-05} +{"ts": "2025-12-27T11:00:15", "event": "train_log", "step": 3334, "epoch": 1.4067510548523208, "progress_pct": 23.45, "epoch_pct": 23.45, "eta": "55:02:17", "max_grad_norm": 0.8, "loss": 0.7142994403839111, "grad_norm": 1.2507994174957275, "learning_rate": 9.321294817609394e-05} +{"ts": "2025-12-27T11:00:36", "event": "train_log", "step": 3336, "epoch": 1.4075949367088607, "progress_pct": 23.46, "epoch_pct": 23.46, "eta": "55:00:50", "max_grad_norm": 0.8, "loss": 0.709568977355957, "grad_norm": 1.1310259103775024, "learning_rate": 9.320086270574854e-05} +{"ts": "2025-12-27T11:00:55", "event": "train_log", "step": 3338, "epoch": 1.4084388185654009, "progress_pct": 23.47, "epoch_pct": 23.47, "eta": "54:59:18", "max_grad_norm": 0.8, "loss": 0.7800853848457336, "grad_norm": 1.2454090118408203, "learning_rate": 9.318876726979385e-05} +{"ts": "2025-12-27T11:01:16", "event": "train_log", "step": 3340, "epoch": 1.409282700421941, "progress_pct": 23.49, "epoch_pct": 23.49, "eta": "54:57:49", "max_grad_norm": 0.8, "loss": 0.6187908053398132, "grad_norm": 1.1168389320373535, "learning_rate": 9.317666187101996e-05} +{"ts": "2025-12-27T11:01:34", "event": "train_log", "step": 3342, "epoch": 1.410126582278481, "progress_pct": 23.5, "epoch_pct": 23.5, "eta": "54:56:14", "max_grad_norm": 0.8, "loss": 0.6222613453865051, "grad_norm": 1.6696287393569946, "learning_rate": 9.316454651221942e-05} +{"ts": "2025-12-27T11:01:55", "event": "train_log", "step": 3344, "epoch": 1.4109704641350211, "progress_pct": 23.52, "epoch_pct": 23.52, "eta": "54:54:48", "max_grad_norm": 0.8, "loss": 0.6116594672203064, "grad_norm": 0.9500295519828796, "learning_rate": 9.315242119618698e-05} +{"ts": "2025-12-27T11:02:13", "event": "train_log", "step": 3346, "epoch": 1.4118143459915613, "progress_pct": 23.53, "epoch_pct": 23.53, "eta": "54:53:11", "max_grad_norm": 0.8, "loss": 0.633224368095398, "grad_norm": 1.186358094215393, "learning_rate": 9.314028592571973e-05} +{"ts": "2025-12-27T11:02:32", "event": "train_log", "step": 3348, "epoch": 1.4126582278481012, "progress_pct": 23.54, "epoch_pct": 23.54, "eta": "54:51:40", "max_grad_norm": 0.8, "loss": 0.6675921082496643, "grad_norm": 1.1855978965759277, "learning_rate": 9.312814070361705e-05} +{"ts": "2025-12-27T11:02:52", "event": "train_log", "step": 3350, "epoch": 1.4135021097046414, "progress_pct": 23.56, "epoch_pct": 23.56, "eta": "54:50:09", "max_grad_norm": 0.8, "loss": 0.7268879413604736, "grad_norm": 1.2465872764587402, "learning_rate": 9.311598553268059e-05} +{"ts": "2025-12-27T11:03:11", "event": "train_log", "step": 3352, "epoch": 1.4143459915611816, "progress_pct": 23.57, "epoch_pct": 23.57, "eta": "54:48:38", "max_grad_norm": 0.8, "loss": 0.6147416830062866, "grad_norm": 1.151274561882019, "learning_rate": 9.310382041571435e-05} +{"ts": "2025-12-27T11:03:31", "event": "train_log", "step": 3354, "epoch": 1.4151898734177215, "progress_pct": 23.59, "epoch_pct": 23.59, "eta": "54:47:08", "max_grad_norm": 0.8, "loss": 0.6678543090820312, "grad_norm": 1.1226807832717896, "learning_rate": 9.309164535552453e-05} +{"ts": "2025-12-27T11:03:50", "event": "train_log", "step": 3356, "epoch": 1.4160337552742617, "progress_pct": 23.6, "epoch_pct": 23.6, "eta": "54:45:35", "max_grad_norm": 0.8, "loss": 0.6334129571914673, "grad_norm": 1.375842571258545, "learning_rate": 9.307946035491975e-05} +{"ts": "2025-12-27T11:04:09", "event": "train_log", "step": 3358, "epoch": 1.4168776371308016, "progress_pct": 23.61, "epoch_pct": 23.61, "eta": "54:44:05", "max_grad_norm": 0.8, "loss": 0.6582583785057068, "grad_norm": 1.058353066444397, "learning_rate": 9.306726541671081e-05} +{"ts": "2025-12-27T11:04:29", "event": "train_log", "step": 3360, "epoch": 1.4177215189873418, "progress_pct": 23.63, "epoch_pct": 23.63, "eta": "54:42:36", "max_grad_norm": 0.8, "loss": 0.5877419114112854, "grad_norm": 1.0511330366134644, "learning_rate": 9.305506054371084e-05} +{"ts": "2025-12-27T11:04:47", "event": "train_log", "step": 3362, "epoch": 1.4185654008438817, "progress_pct": 23.64, "epoch_pct": 23.64, "eta": "54:41:00", "max_grad_norm": 0.8, "loss": 0.711665689945221, "grad_norm": 1.2246462106704712, "learning_rate": 9.304284573873532e-05} +{"ts": "2025-12-27T11:05:06", "event": "train_log", "step": 3364, "epoch": 1.4194092827004219, "progress_pct": 23.66, "epoch_pct": 23.66, "eta": "54:39:29", "max_grad_norm": 0.8, "loss": 0.6743642687797546, "grad_norm": 1.0242294073104858, "learning_rate": 9.303062100460193e-05} +{"ts": "2025-12-27T11:05:25", "event": "train_log", "step": 3366, "epoch": 1.420253164556962, "progress_pct": 23.67, "epoch_pct": 23.67, "eta": "54:37:56", "max_grad_norm": 0.8, "loss": 0.6825576424598694, "grad_norm": 1.1432100534439087, "learning_rate": 9.301838634413069e-05} +{"ts": "2025-12-27T11:05:46", "event": "train_log", "step": 3368, "epoch": 1.421097046413502, "progress_pct": 23.68, "epoch_pct": 23.68, "eta": "54:36:30", "max_grad_norm": 0.8, "loss": 0.624455988407135, "grad_norm": 1.0128604173660278, "learning_rate": 9.30061417601439e-05} +{"ts": "2025-12-27T11:06:05", "event": "train_log", "step": 3370, "epoch": 1.4219409282700421, "progress_pct": 23.7, "epoch_pct": 23.7, "eta": "54:34:58", "max_grad_norm": 0.8, "loss": 0.7029586434364319, "grad_norm": 1.2738330364227295, "learning_rate": 9.299388725546617e-05} +{"ts": "2025-12-27T11:06:25", "event": "train_log", "step": 3372, "epoch": 1.4227848101265823, "progress_pct": 23.71, "epoch_pct": 23.71, "eta": "54:33:30", "max_grad_norm": 0.8, "loss": 0.5994319915771484, "grad_norm": 1.0857324600219727, "learning_rate": 9.298162283292435e-05} +{"ts": "2025-12-27T11:06:44", "event": "train_log", "step": 3374, "epoch": 1.4236286919831223, "progress_pct": 23.73, "epoch_pct": 23.73, "eta": "54:32:00", "max_grad_norm": 0.8, "loss": 0.6537772417068481, "grad_norm": 1.0811917781829834, "learning_rate": 9.296934849534763e-05} +{"ts": "2025-12-27T11:07:04", "event": "train_log", "step": 3376, "epoch": 1.4244725738396624, "progress_pct": 23.74, "epoch_pct": 23.74, "eta": "54:30:30", "max_grad_norm": 0.8, "loss": 0.5775008201599121, "grad_norm": 1.006913185119629, "learning_rate": 9.295706424556745e-05} +{"ts": "2025-12-27T11:07:23", "event": "train_log", "step": 3378, "epoch": 1.4253164556962026, "progress_pct": 23.76, "epoch_pct": 23.76, "eta": "54:28:58", "max_grad_norm": 0.8, "loss": 0.7445536255836487, "grad_norm": 1.2306486368179321, "learning_rate": 9.294477008641755e-05} +{"ts": "2025-12-27T11:07:42", "event": "train_log", "step": 3380, "epoch": 1.4261603375527425, "progress_pct": 23.77, "epoch_pct": 23.77, "eta": "54:27:28", "max_grad_norm": 0.8, "loss": 0.6081538796424866, "grad_norm": 1.223608374595642, "learning_rate": 9.293246602073398e-05} +{"ts": "2025-12-27T11:08:02", "event": "train_log", "step": 3382, "epoch": 1.4270042194092827, "progress_pct": 23.78, "epoch_pct": 23.78, "eta": "54:26:00", "max_grad_norm": 0.8, "loss": 0.6134634613990784, "grad_norm": 1.0933321714401245, "learning_rate": 9.2920152051355e-05} +{"ts": "2025-12-27T11:08:22", "event": "train_log", "step": 3384, "epoch": 1.4278481012658228, "progress_pct": 23.8, "epoch_pct": 23.8, "eta": "54:24:31", "max_grad_norm": 0.8, "loss": 0.5961087346076965, "grad_norm": 1.1738401651382446, "learning_rate": 9.290782818112127e-05} +{"ts": "2025-12-27T11:08:42", "event": "train_log", "step": 3386, "epoch": 1.4286919831223628, "progress_pct": 23.81, "epoch_pct": 23.81, "eta": "54:23:04", "max_grad_norm": 0.8, "loss": 0.6284122467041016, "grad_norm": 1.1493438482284546, "learning_rate": 9.289549441287561e-05} +{"ts": "2025-12-27T11:09:01", "event": "train_log", "step": 3388, "epoch": 1.429535864978903, "progress_pct": 23.83, "epoch_pct": 23.83, "eta": "54:21:33", "max_grad_norm": 0.8, "loss": 0.6654639840126038, "grad_norm": 1.1907998323440552, "learning_rate": 9.288315074946324e-05} +{"ts": "2025-12-27T11:09:21", "event": "train_log", "step": 3390, "epoch": 1.4303797468354431, "progress_pct": 23.84, "epoch_pct": 23.84, "eta": "54:20:04", "max_grad_norm": 0.8, "loss": 0.652850329875946, "grad_norm": 1.3423025608062744, "learning_rate": 9.287079719373157e-05} +{"ts": "2025-12-27T11:09:40", "event": "train_log", "step": 3392, "epoch": 1.431223628691983, "progress_pct": 23.85, "epoch_pct": 23.85, "eta": "54:18:35", "max_grad_norm": 0.8, "loss": 0.703445315361023, "grad_norm": 1.3932039737701416, "learning_rate": 9.285843374853034e-05} +{"ts": "2025-12-27T11:10:00", "event": "train_log", "step": 3394, "epoch": 1.4320675105485232, "progress_pct": 23.87, "epoch_pct": 23.87, "eta": "54:17:06", "max_grad_norm": 0.8, "loss": 0.693265438079834, "grad_norm": 5.349400043487549, "learning_rate": 9.284606041671155e-05} +{"ts": "2025-12-27T11:10:20", "event": "train_log", "step": 3396, "epoch": 1.4329113924050634, "progress_pct": 23.88, "epoch_pct": 23.88, "eta": "54:15:38", "max_grad_norm": 0.8, "loss": 0.6578536033630371, "grad_norm": 1.0921961069107056, "learning_rate": 9.28336772011295e-05} +{"ts": "2025-12-27T11:10:38", "event": "train_log", "step": 3398, "epoch": 1.4337552742616033, "progress_pct": 23.9, "epoch_pct": 23.9, "eta": "54:14:07", "max_grad_norm": 0.8, "loss": 0.7092277407646179, "grad_norm": 1.184157133102417, "learning_rate": 9.282128410464074e-05} +{"ts": "2025-12-27T11:10:57", "event": "train_log", "step": 3400, "epoch": 1.4345991561181435, "progress_pct": 23.91, "epoch_pct": 23.91, "eta": "54:12:36", "max_grad_norm": 0.8, "loss": 0.6866328120231628, "grad_norm": 1.0923491716384888, "learning_rate": 9.280888113010415e-05} +{"ts": "2025-12-27T11:25:25", "event": "train_log", "step": 3400, "epoch": 1.4345991561181435, "progress_pct": 23.91, "epoch_pct": 23.91, "eta": "54:58:40", "max_grad_norm": 0.8, "eval_loss": 0.715917706489563, "eval_runtime": 868.51, "eval_samples_per_second": 2.426, "eval_steps_per_second": 2.426} +{"ts": "2025-12-27T11:25:44", "event": "train_log", "step": 3402, "epoch": 1.4354430379746836, "progress_pct": 23.92, "epoch_pct": 23.92, "eta": "54:57:07", "max_grad_norm": 0.8, "loss": 0.6617444157600403, "grad_norm": 1.2515597343444824, "learning_rate": 9.279646828038083e-05} +{"ts": "2025-12-27T11:26:03", "event": "train_log", "step": 3404, "epoch": 1.4362869198312236, "progress_pct": 23.94, "epoch_pct": 23.94, "eta": "54:55:33", "max_grad_norm": 0.8, "loss": 0.6373176574707031, "grad_norm": 1.2122540473937988, "learning_rate": 9.278404555833422e-05} +{"ts": "2025-12-27T11:26:22", "event": "train_log", "step": 3406, "epoch": 1.4371308016877637, "progress_pct": 23.95, "epoch_pct": 23.95, "eta": "54:54:02", "max_grad_norm": 0.8, "loss": 0.6506488919258118, "grad_norm": 1.191904902458191, "learning_rate": 9.277161296682997e-05} +{"ts": "2025-12-27T11:26:41", "event": "train_log", "step": 3408, "epoch": 1.437974683544304, "progress_pct": 23.97, "epoch_pct": 23.97, "eta": "54:52:27", "max_grad_norm": 0.8, "loss": 0.7172291874885559, "grad_norm": 1.2492214441299438, "learning_rate": 9.275917050873606e-05} +{"ts": "2025-12-27T11:27:00", "event": "train_log", "step": 3410, "epoch": 1.4388185654008439, "progress_pct": 23.98, "epoch_pct": 23.98, "eta": "54:50:58", "max_grad_norm": 0.8, "loss": 0.6180248260498047, "grad_norm": 1.0518640279769897, "learning_rate": 9.274671818692272e-05} +{"ts": "2025-12-27T11:27:20", "event": "train_log", "step": 3412, "epoch": 1.439662447257384, "progress_pct": 23.99, "epoch_pct": 23.99, "eta": "54:49:29", "max_grad_norm": 0.8, "loss": 0.6828892827033997, "grad_norm": 1.150563359260559, "learning_rate": 9.273425600426245e-05} +{"ts": "2025-12-27T11:27:40", "event": "train_log", "step": 3414, "epoch": 1.4405063291139242, "progress_pct": 24.01, "epoch_pct": 24.01, "eta": "54:47:59", "max_grad_norm": 0.8, "loss": 0.6585919857025146, "grad_norm": 1.76945960521698, "learning_rate": 9.272178396363005e-05} +{"ts": "2025-12-27T11:27:58", "event": "train_log", "step": 3416, "epoch": 1.4413502109704641, "progress_pct": 24.02, "epoch_pct": 24.02, "eta": "54:46:23", "max_grad_norm": 0.8, "loss": 0.7548692226409912, "grad_norm": 1.2367758750915527, "learning_rate": 9.270930206790257e-05} +{"ts": "2025-12-27T11:28:17", "event": "train_log", "step": 3418, "epoch": 1.4421940928270043, "progress_pct": 24.04, "epoch_pct": 24.04, "eta": "54:44:52", "max_grad_norm": 0.8, "loss": 0.7017102837562561, "grad_norm": 1.2292778491973877, "learning_rate": 9.269681031995936e-05} +{"ts": "2025-12-27T11:28:36", "event": "train_log", "step": 3420, "epoch": 1.4430379746835442, "progress_pct": 24.05, "epoch_pct": 24.05, "eta": "54:43:20", "max_grad_norm": 0.8, "loss": 0.6657648682594299, "grad_norm": 1.2193396091461182, "learning_rate": 9.268430872268202e-05} +{"ts": "2025-12-27T11:28:55", "event": "train_log", "step": 3422, "epoch": 1.4438818565400844, "progress_pct": 24.06, "epoch_pct": 24.06, "eta": "54:41:50", "max_grad_norm": 0.8, "loss": 0.6950910091400146, "grad_norm": 1.0505954027175903, "learning_rate": 9.267179727895443e-05} +{"ts": "2025-12-27T11:29:15", "event": "train_log", "step": 3424, "epoch": 1.4447257383966245, "progress_pct": 24.08, "epoch_pct": 24.08, "eta": "54:40:20", "max_grad_norm": 0.8, "loss": 0.689308226108551, "grad_norm": 1.1560698747634888, "learning_rate": 9.265927599166272e-05} +{"ts": "2025-12-27T11:29:34", "event": "train_log", "step": 3426, "epoch": 1.4455696202531645, "progress_pct": 24.09, "epoch_pct": 24.09, "eta": "54:38:50", "max_grad_norm": 0.8, "loss": 0.6481659412384033, "grad_norm": 1.189336895942688, "learning_rate": 9.264674486369533e-05} +{"ts": "2025-12-27T11:29:53", "event": "train_log", "step": 3428, "epoch": 1.4464135021097047, "progress_pct": 24.11, "epoch_pct": 24.11, "eta": "54:37:17", "max_grad_norm": 0.8, "loss": 0.6626612544059753, "grad_norm": 1.3527976274490356, "learning_rate": 9.263420389794294e-05} +{"ts": "2025-12-27T11:30:12", "event": "train_log", "step": 3430, "epoch": 1.4472573839662446, "progress_pct": 24.12, "epoch_pct": 24.12, "eta": "54:35:47", "max_grad_norm": 0.8, "loss": 0.690841794013977, "grad_norm": 1.096303105354309, "learning_rate": 9.262165309729854e-05} +{"ts": "2025-12-27T11:30:30", "event": "train_log", "step": 3432, "epoch": 1.4481012658227848, "progress_pct": 24.14, "epoch_pct": 24.14, "eta": "54:34:12", "max_grad_norm": 0.8, "loss": 0.6497649550437927, "grad_norm": 1.2131421566009521, "learning_rate": 9.260909246465732e-05} +{"ts": "2025-12-27T11:30:50", "event": "train_log", "step": 3434, "epoch": 1.448945147679325, "progress_pct": 24.15, "epoch_pct": 24.15, "eta": "54:32:42", "max_grad_norm": 0.8, "loss": 0.6236130595207214, "grad_norm": 1.1831032037734985, "learning_rate": 9.259652200291678e-05} +{"ts": "2025-12-27T11:31:09", "event": "train_log", "step": 3436, "epoch": 1.4497890295358649, "progress_pct": 24.16, "epoch_pct": 24.16, "eta": "54:31:13", "max_grad_norm": 0.8, "loss": 0.5223423838615417, "grad_norm": 0.9745979309082031, "learning_rate": 9.25839417149767e-05} +{"ts": "2025-12-27T11:31:28", "event": "train_log", "step": 3438, "epoch": 1.450632911392405, "progress_pct": 24.18, "epoch_pct": 24.18, "eta": "54:29:41", "max_grad_norm": 0.8, "loss": 0.6642022728919983, "grad_norm": 1.372460126876831, "learning_rate": 9.257135160373912e-05} +{"ts": "2025-12-27T11:31:49", "event": "train_log", "step": 3440, "epoch": 1.4514767932489452, "progress_pct": 24.19, "epoch_pct": 24.19, "eta": "54:28:16", "max_grad_norm": 0.8, "loss": 0.5426992774009705, "grad_norm": 1.421044111251831, "learning_rate": 9.255875167210832e-05} +{"ts": "2025-12-27T11:32:07", "event": "train_log", "step": 3442, "epoch": 1.4523206751054851, "progress_pct": 24.21, "epoch_pct": 24.21, "eta": "54:26:42", "max_grad_norm": 0.8, "loss": 0.6260567307472229, "grad_norm": 1.1694250106811523, "learning_rate": 9.254614192299086e-05} +{"ts": "2025-12-27T11:32:27", "event": "train_log", "step": 3444, "epoch": 1.4531645569620253, "progress_pct": 24.22, "epoch_pct": 24.22, "eta": "54:25:14", "max_grad_norm": 0.8, "loss": 0.5776100158691406, "grad_norm": 1.0892298221588135, "learning_rate": 9.253352235929558e-05} +{"ts": "2025-12-27T11:32:46", "event": "train_log", "step": 3446, "epoch": 1.4540084388185655, "progress_pct": 24.23, "epoch_pct": 24.23, "eta": "54:23:45", "max_grad_norm": 0.8, "loss": 0.6495202779769897, "grad_norm": 1.1841259002685547, "learning_rate": 9.252089298393356e-05} +{"ts": "2025-12-27T11:33:06", "event": "train_log", "step": 3448, "epoch": 1.4548523206751054, "progress_pct": 24.25, "epoch_pct": 24.25, "eta": "54:22:18", "max_grad_norm": 0.8, "loss": 0.6570594906806946, "grad_norm": 1.1133549213409424, "learning_rate": 9.250825379981815e-05} +{"ts": "2025-12-27T11:33:25", "event": "train_log", "step": 3450, "epoch": 1.4556962025316456, "progress_pct": 24.26, "epoch_pct": 24.26, "eta": "54:20:46", "max_grad_norm": 0.8, "loss": 0.6496587991714478, "grad_norm": 1.197100281715393, "learning_rate": 9.249560480986498e-05} +{"ts": "2025-12-27T11:33:43", "event": "train_log", "step": 3452, "epoch": 1.4565400843881857, "progress_pct": 24.28, "epoch_pct": 24.28, "eta": "54:19:13", "max_grad_norm": 0.8, "loss": 0.6644704341888428, "grad_norm": 1.1661107540130615, "learning_rate": 9.248294601699193e-05} +{"ts": "2025-12-27T11:34:01", "event": "train_log", "step": 3454, "epoch": 1.4573839662447257, "progress_pct": 24.29, "epoch_pct": 24.29, "eta": "54:17:40", "max_grad_norm": 0.8, "loss": 0.6451231241226196, "grad_norm": 1.2257879972457886, "learning_rate": 9.247027742411912e-05} +{"ts": "2025-12-27T11:34:21", "event": "train_log", "step": 3456, "epoch": 1.4582278481012658, "progress_pct": 24.3, "epoch_pct": 24.3, "eta": "54:16:13", "max_grad_norm": 0.8, "loss": 0.6108601093292236, "grad_norm": 1.3634982109069824, "learning_rate": 9.245759903416897e-05} +{"ts": "2025-12-27T11:34:40", "event": "train_log", "step": 3458, "epoch": 1.459071729957806, "progress_pct": 24.32, "epoch_pct": 24.32, "eta": "54:14:44", "max_grad_norm": 0.8, "loss": 0.6080004572868347, "grad_norm": 1.1802605390548706, "learning_rate": 9.244491085006615e-05} +{"ts": "2025-12-27T11:35:01", "event": "train_log", "step": 3460, "epoch": 1.459915611814346, "progress_pct": 24.33, "epoch_pct": 24.33, "eta": "54:13:18", "max_grad_norm": 0.8, "loss": 0.6406423449516296, "grad_norm": 1.280831217765808, "learning_rate": 9.243221287473756e-05} +{"ts": "2025-12-27T11:35:19", "event": "train_log", "step": 3462, "epoch": 1.460759493670886, "progress_pct": 24.35, "epoch_pct": 24.35, "eta": "54:11:47", "max_grad_norm": 0.8, "loss": 0.7320113778114319, "grad_norm": 1.3127192258834839, "learning_rate": 9.241950511111237e-05} +{"ts": "2025-12-27T11:35:39", "event": "train_log", "step": 3464, "epoch": 1.4616033755274263, "progress_pct": 24.36, "epoch_pct": 24.36, "eta": "54:10:18", "max_grad_norm": 0.8, "loss": 0.572110652923584, "grad_norm": 1.1711835861206055, "learning_rate": 9.240678756212204e-05} +{"ts": "2025-12-27T11:35:57", "event": "train_log", "step": 3466, "epoch": 1.4624472573839662, "progress_pct": 24.37, "epoch_pct": 24.37, "eta": "54:08:47", "max_grad_norm": 0.8, "loss": 0.7446795105934143, "grad_norm": 1.347143292427063, "learning_rate": 9.239406023070028e-05} +{"ts": "2025-12-27T11:36:17", "event": "train_log", "step": 3468, "epoch": 1.4632911392405064, "progress_pct": 24.39, "epoch_pct": 24.39, "eta": "54:07:19", "max_grad_norm": 0.8, "loss": 0.6709978580474854, "grad_norm": 1.4953652620315552, "learning_rate": 9.238132311978299e-05} +{"ts": "2025-12-27T11:36:36", "event": "train_log", "step": 3470, "epoch": 1.4641350210970465, "progress_pct": 24.4, "epoch_pct": 24.4, "eta": "54:05:50", "max_grad_norm": 0.8, "loss": 0.6691445112228394, "grad_norm": 1.2199387550354004, "learning_rate": 9.236857623230842e-05} +{"ts": "2025-12-27T11:36:56", "event": "train_log", "step": 3472, "epoch": 1.4649789029535865, "progress_pct": 24.42, "epoch_pct": 24.42, "eta": "54:04:24", "max_grad_norm": 0.8, "loss": 0.6964292526245117, "grad_norm": 1.0959199666976929, "learning_rate": 9.235581957121702e-05} +{"ts": "2025-12-27T11:37:16", "event": "train_log", "step": 3474, "epoch": 1.4658227848101266, "progress_pct": 24.43, "epoch_pct": 24.43, "eta": "54:02:58", "max_grad_norm": 0.8, "loss": 0.6880454421043396, "grad_norm": 1.455505609512329, "learning_rate": 9.234305313945149e-05} +{"ts": "2025-12-27T11:37:36", "event": "train_log", "step": 3476, "epoch": 1.4666666666666668, "progress_pct": 24.44, "epoch_pct": 24.44, "eta": "54:01:31", "max_grad_norm": 0.8, "loss": 0.6737138032913208, "grad_norm": 1.2820862531661987, "learning_rate": 9.233027693995681e-05} +{"ts": "2025-12-27T11:37:54", "event": "train_log", "step": 3478, "epoch": 1.4675105485232067, "progress_pct": 24.46, "epoch_pct": 24.46, "eta": "53:59:58", "max_grad_norm": 0.8, "loss": 0.6874006390571594, "grad_norm": 1.3459213972091675, "learning_rate": 9.231749097568023e-05} +{"ts": "2025-12-27T11:38:14", "event": "train_log", "step": 3480, "epoch": 1.4683544303797469, "progress_pct": 24.47, "epoch_pct": 24.47, "eta": "53:58:31", "max_grad_norm": 0.8, "loss": 0.7179469466209412, "grad_norm": 1.2815442085266113, "learning_rate": 9.230469524957119e-05} +{"ts": "2025-12-27T11:38:33", "event": "train_log", "step": 3482, "epoch": 1.469198312236287, "progress_pct": 24.49, "epoch_pct": 24.49, "eta": "53:57:03", "max_grad_norm": 0.8, "loss": 0.7525522112846375, "grad_norm": 1.6181597709655762, "learning_rate": 9.229188976458145e-05} +{"ts": "2025-12-27T11:38:54", "event": "train_log", "step": 3484, "epoch": 1.470042194092827, "progress_pct": 24.5, "epoch_pct": 24.5, "eta": "53:55:39", "max_grad_norm": 0.8, "loss": 0.5918128490447998, "grad_norm": 1.0633227825164795, "learning_rate": 9.227907452366495e-05} +{"ts": "2025-12-27T11:39:13", "event": "train_log", "step": 3486, "epoch": 1.4708860759493672, "progress_pct": 24.51, "epoch_pct": 24.51, "eta": "53:54:10", "max_grad_norm": 0.8, "loss": 0.6686186194419861, "grad_norm": 1.2055985927581787, "learning_rate": 9.226624952977796e-05} +{"ts": "2025-12-27T11:39:32", "event": "train_log", "step": 3488, "epoch": 1.471729957805907, "progress_pct": 24.53, "epoch_pct": 24.53, "eta": "53:52:42", "max_grad_norm": 0.8, "loss": 0.764410674571991, "grad_norm": 1.2495088577270508, "learning_rate": 9.225341478587893e-05} +{"ts": "2025-12-27T11:39:53", "event": "train_log", "step": 3490, "epoch": 1.4725738396624473, "progress_pct": 24.54, "epoch_pct": 24.54, "eta": "53:51:19", "max_grad_norm": 0.8, "loss": 0.7066780924797058, "grad_norm": 1.174229383468628, "learning_rate": 9.22405702949286e-05} +{"ts": "2025-12-27T11:40:13", "event": "train_log", "step": 3492, "epoch": 1.4734177215189874, "progress_pct": 24.56, "epoch_pct": 24.56, "eta": "53:49:53", "max_grad_norm": 0.8, "loss": 0.6740228533744812, "grad_norm": 1.0970302820205688, "learning_rate": 9.222771605988995e-05} +{"ts": "2025-12-27T11:40:32", "event": "train_log", "step": 3494, "epoch": 1.4742616033755274, "progress_pct": 24.57, "epoch_pct": 24.57, "eta": "53:48:23", "max_grad_norm": 0.8, "loss": 0.698371410369873, "grad_norm": 1.2470436096191406, "learning_rate": 9.221485208372822e-05} +{"ts": "2025-12-27T11:40:52", "event": "train_log", "step": 3496, "epoch": 1.4751054852320675, "progress_pct": 24.59, "epoch_pct": 24.59, "eta": "53:47:00", "max_grad_norm": 0.8, "loss": 0.6354188919067383, "grad_norm": 1.0750112533569336, "learning_rate": 9.220197836941084e-05} +{"ts": "2025-12-27T11:41:12", "event": "train_log", "step": 3498, "epoch": 1.4759493670886075, "progress_pct": 24.6, "epoch_pct": 24.6, "eta": "53:45:33", "max_grad_norm": 0.8, "loss": 0.7268608212471008, "grad_norm": 1.2656232118606567, "learning_rate": 9.218909491990757e-05} +{"ts": "2025-12-27T11:41:31", "event": "train_log", "step": 3500, "epoch": 1.4767932489451476, "progress_pct": 24.61, "epoch_pct": 24.61, "eta": "53:44:06", "max_grad_norm": 0.8, "loss": 0.6652966141700745, "grad_norm": 1.2389028072357178, "learning_rate": 9.217620173819037e-05} +{"ts": "2025-12-27T11:55:47", "event": "train_log", "step": 3500, "epoch": 1.4767932489451476, "progress_pct": 24.61, "epoch_pct": 24.61, "eta": "54:27:47", "max_grad_norm": 0.8, "eval_loss": 0.7155047059059143, "eval_runtime": 855.8428, "eval_samples_per_second": 2.462, "eval_steps_per_second": 2.462} +{"ts": "2025-12-27T11:56:07", "event": "train_log", "step": 3502, "epoch": 1.4776371308016878, "progress_pct": 24.63, "epoch_pct": 24.63, "eta": "54:26:20", "max_grad_norm": 0.8, "loss": 0.6845020651817322, "grad_norm": 1.218304991722107, "learning_rate": 9.216329882723343e-05} +{"ts": "2025-12-27T11:56:26", "event": "train_log", "step": 3504, "epoch": 1.4784810126582277, "progress_pct": 24.64, "epoch_pct": 24.64, "eta": "54:24:49", "max_grad_norm": 0.8, "loss": 0.6972519755363464, "grad_norm": 1.123903512954712, "learning_rate": 9.21503861900132e-05} +{"ts": "2025-12-27T11:56:46", "event": "train_log", "step": 3506, "epoch": 1.479324894514768, "progress_pct": 24.66, "epoch_pct": 24.66, "eta": "54:23:23", "max_grad_norm": 0.8, "loss": 0.6699702739715576, "grad_norm": 1.1827739477157593, "learning_rate": 9.213746382950839e-05} +{"ts": "2025-12-27T11:57:07", "event": "train_log", "step": 3508, "epoch": 1.480168776371308, "progress_pct": 24.67, "epoch_pct": 24.67, "eta": "54:21:58", "max_grad_norm": 0.8, "loss": 0.5623225569725037, "grad_norm": 0.9934872984886169, "learning_rate": 9.212453174869995e-05} +{"ts": "2025-12-27T11:57:25", "event": "train_log", "step": 3510, "epoch": 1.481012658227848, "progress_pct": 24.68, "epoch_pct": 24.68, "eta": "54:20:26", "max_grad_norm": 0.8, "loss": 0.6527173519134521, "grad_norm": 1.221093773841858, "learning_rate": 9.211158995057105e-05} +{"ts": "2025-12-27T11:57:44", "event": "train_log", "step": 3512, "epoch": 1.4818565400843882, "progress_pct": 24.7, "epoch_pct": 24.7, "eta": "54:18:55", "max_grad_norm": 0.8, "loss": 0.7015712261199951, "grad_norm": 1.4569166898727417, "learning_rate": 9.209863843810711e-05} +{"ts": "2025-12-27T11:58:04", "event": "train_log", "step": 3514, "epoch": 1.4827004219409283, "progress_pct": 24.71, "epoch_pct": 24.71, "eta": "54:17:29", "max_grad_norm": 0.8, "loss": 0.6442505717277527, "grad_norm": 1.0764813423156738, "learning_rate": 9.208567721429581e-05} +{"ts": "2025-12-27T11:58:24", "event": "train_log", "step": 3516, "epoch": 1.4835443037974683, "progress_pct": 24.73, "epoch_pct": 24.73, "eta": "54:16:01", "max_grad_norm": 0.8, "loss": 0.666451096534729, "grad_norm": 2.1307506561279297, "learning_rate": 9.207270628212704e-05} +{"ts": "2025-12-27T11:58:43", "event": "train_log", "step": 3518, "epoch": 1.4843881856540084, "progress_pct": 24.74, "epoch_pct": 24.74, "eta": "54:14:30", "max_grad_norm": 0.8, "loss": 0.6354807019233704, "grad_norm": 1.180590271949768, "learning_rate": 9.205972564459296e-05} +{"ts": "2025-12-27T11:59:01", "event": "train_log", "step": 3520, "epoch": 1.4852320675105486, "progress_pct": 24.75, "epoch_pct": 24.75, "eta": "54:13:00", "max_grad_norm": 0.8, "loss": 0.6080324053764343, "grad_norm": 1.2999447584152222, "learning_rate": 9.204673530468795e-05} +{"ts": "2025-12-27T11:59:19", "event": "train_log", "step": 3522, "epoch": 1.4860759493670885, "progress_pct": 24.77, "epoch_pct": 24.77, "eta": "54:11:28", "max_grad_norm": 0.8, "loss": 0.6411244869232178, "grad_norm": 1.1680655479431152, "learning_rate": 9.203373526540862e-05} +{"ts": "2025-12-27T11:59:39", "event": "train_log", "step": 3524, "epoch": 1.4869198312236287, "progress_pct": 24.78, "epoch_pct": 24.78, "eta": "54:10:00", "max_grad_norm": 0.8, "loss": 0.6498287916183472, "grad_norm": 1.0565013885498047, "learning_rate": 9.202072552975383e-05} +{"ts": "2025-12-27T11:59:58", "event": "train_log", "step": 3526, "epoch": 1.4877637130801689, "progress_pct": 24.8, "epoch_pct": 24.8, "eta": "54:08:30", "max_grad_norm": 0.8, "loss": 0.633613109588623, "grad_norm": 1.246267318725586, "learning_rate": 9.20077061007247e-05} +{"ts": "2025-12-27T12:00:17", "event": "train_log", "step": 3528, "epoch": 1.4886075949367088, "progress_pct": 24.81, "epoch_pct": 24.81, "eta": "54:07:00", "max_grad_norm": 0.8, "loss": 0.6102107167243958, "grad_norm": 1.0626300573349, "learning_rate": 9.199467698132453e-05} +{"ts": "2025-12-27T12:00:36", "event": "train_log", "step": 3530, "epoch": 1.489451476793249, "progress_pct": 24.82, "epoch_pct": 24.82, "eta": "54:05:33", "max_grad_norm": 0.8, "loss": 0.669352114200592, "grad_norm": 1.256600260734558, "learning_rate": 9.198163817455892e-05} +{"ts": "2025-12-27T12:00:58", "event": "train_log", "step": 3532, "epoch": 1.4902953586497891, "progress_pct": 24.84, "epoch_pct": 24.84, "eta": "54:04:10", "max_grad_norm": 0.8, "loss": 0.6305804252624512, "grad_norm": 1.143188238143921, "learning_rate": 9.196858968343565e-05} +{"ts": "2025-12-27T12:01:18", "event": "train_log", "step": 3534, "epoch": 1.491139240506329, "progress_pct": 24.85, "epoch_pct": 24.85, "eta": "54:02:45", "max_grad_norm": 0.8, "loss": 0.6256994605064392, "grad_norm": 1.1471205949783325, "learning_rate": 9.195553151096475e-05} +{"ts": "2025-12-27T12:01:37", "event": "train_log", "step": 3536, "epoch": 1.4919831223628692, "progress_pct": 24.87, "epoch_pct": 24.87, "eta": "54:01:18", "max_grad_norm": 0.8, "loss": 0.6395107507705688, "grad_norm": 1.1771589517593384, "learning_rate": 9.194246366015851e-05} +{"ts": "2025-12-27T12:01:57", "event": "train_log", "step": 3538, "epoch": 1.4928270042194094, "progress_pct": 24.88, "epoch_pct": 24.88, "eta": "53:59:50", "max_grad_norm": 0.8, "loss": 0.6875160932540894, "grad_norm": 1.1997097730636597, "learning_rate": 9.192938613403144e-05} +{"ts": "2025-12-27T12:02:14", "event": "train_log", "step": 3540, "epoch": 1.4936708860759493, "progress_pct": 24.89, "epoch_pct": 24.89, "eta": "53:58:17", "max_grad_norm": 0.8, "loss": 0.7216510772705078, "grad_norm": 1.3962169885635376, "learning_rate": 9.191629893560024e-05} +{"ts": "2025-12-27T12:02:35", "event": "train_log", "step": 3542, "epoch": 1.4945147679324895, "progress_pct": 24.91, "epoch_pct": 24.91, "eta": "53:56:52", "max_grad_norm": 0.8, "loss": 0.6870693564414978, "grad_norm": 1.1835654973983765, "learning_rate": 9.19032020678839e-05} +{"ts": "2025-12-27T12:02:54", "event": "train_log", "step": 3544, "epoch": 1.4953586497890297, "progress_pct": 24.92, "epoch_pct": 24.92, "eta": "53:55:25", "max_grad_norm": 0.8, "loss": 0.6266092658042908, "grad_norm": 1.112331509590149, "learning_rate": 9.18900955339036e-05} +{"ts": "2025-12-27T12:03:16", "event": "train_log", "step": 3546, "epoch": 1.4962025316455696, "progress_pct": 24.94, "epoch_pct": 24.94, "eta": "53:54:05", "max_grad_norm": 0.8, "loss": 0.5906343460083008, "grad_norm": 1.0298354625701904, "learning_rate": 9.187697933668278e-05} +{"ts": "2025-12-27T12:03:35", "event": "train_log", "step": 3548, "epoch": 1.4970464135021098, "progress_pct": 24.95, "epoch_pct": 24.95, "eta": "53:52:37", "max_grad_norm": 0.8, "loss": 0.6203610897064209, "grad_norm": 1.2650012969970703, "learning_rate": 9.186385347924709e-05} +{"ts": "2025-12-27T12:03:58", "event": "train_log", "step": 3550, "epoch": 1.49789029535865, "progress_pct": 24.96, "epoch_pct": 24.96, "eta": "53:51:19", "max_grad_norm": 0.8, "loss": 0.6841281652450562, "grad_norm": 1.1208417415618896, "learning_rate": 9.185071796462441e-05} +{"ts": "2025-12-27T12:04:17", "event": "train_log", "step": 3552, "epoch": 1.4987341772151899, "progress_pct": 24.98, "epoch_pct": 24.98, "eta": "53:49:52", "max_grad_norm": 0.8, "loss": 0.7089514136314392, "grad_norm": 1.1319488286972046, "learning_rate": 9.183757279584486e-05} +{"ts": "2025-12-27T12:04:36", "event": "train_log", "step": 3554, "epoch": 1.49957805907173, "progress_pct": 24.99, "epoch_pct": 24.99, "eta": "53:48:24", "max_grad_norm": 0.8, "loss": 0.6663861870765686, "grad_norm": 1.1104235649108887, "learning_rate": 9.182441797594076e-05} +{"ts": "2025-12-27T12:04:56", "event": "train_log", "step": 3556, "epoch": 1.5004219409282702, "progress_pct": 25.01, "epoch_pct": 25.01, "eta": "53:46:58", "max_grad_norm": 0.8, "loss": 0.6713237762451172, "grad_norm": 1.161412000656128, "learning_rate": 9.18112535079467e-05} +{"ts": "2025-12-27T12:05:15", "event": "train_log", "step": 3558, "epoch": 1.5012658227848101, "progress_pct": 25.02, "epoch_pct": 25.02, "eta": "53:45:29", "max_grad_norm": 0.8, "loss": 0.6665274500846863, "grad_norm": 1.2925246953964233, "learning_rate": 9.179807939489945e-05} +{"ts": "2025-12-27T12:05:34", "event": "train_log", "step": 3560, "epoch": 1.50210970464135, "progress_pct": 25.04, "epoch_pct": 25.04, "eta": "53:44:03", "max_grad_norm": 0.8, "loss": 0.6881593465805054, "grad_norm": 1.0968270301818848, "learning_rate": 9.178489563983802e-05} +{"ts": "2025-12-27T12:05:54", "event": "train_log", "step": 3562, "epoch": 1.5029535864978905, "progress_pct": 25.05, "epoch_pct": 25.05, "eta": "53:42:37", "max_grad_norm": 0.8, "loss": 0.631568431854248, "grad_norm": 1.111439824104309, "learning_rate": 9.177170224580368e-05} +{"ts": "2025-12-27T12:06:14", "event": "train_log", "step": 3564, "epoch": 1.5037974683544304, "progress_pct": 25.06, "epoch_pct": 25.06, "eta": "53:41:11", "max_grad_norm": 0.8, "loss": 0.6896167397499084, "grad_norm": 1.6731075048446655, "learning_rate": 9.175849921583986e-05} +{"ts": "2025-12-27T12:06:35", "event": "train_log", "step": 3566, "epoch": 1.5046413502109703, "progress_pct": 25.08, "epoch_pct": 25.08, "eta": "53:39:48", "max_grad_norm": 0.8, "loss": 0.6285277605056763, "grad_norm": 1.226739525794983, "learning_rate": 9.174528655299226e-05} +{"ts": "2025-12-27T12:06:55", "event": "train_log", "step": 3568, "epoch": 1.5054852320675105, "progress_pct": 25.09, "epoch_pct": 25.09, "eta": "53:38:24", "max_grad_norm": 0.8, "loss": 0.6256678700447083, "grad_norm": 1.2030941247940063, "learning_rate": 9.17320642603088e-05} +{"ts": "2025-12-27T12:07:14", "event": "train_log", "step": 3570, "epoch": 1.5063291139240507, "progress_pct": 25.11, "epoch_pct": 25.11, "eta": "53:36:57", "max_grad_norm": 0.8, "loss": 0.6895992159843445, "grad_norm": 1.1980781555175781, "learning_rate": 9.171883234083958e-05} +{"ts": "2025-12-27T12:07:35", "event": "train_log", "step": 3572, "epoch": 1.5071729957805906, "progress_pct": 25.12, "epoch_pct": 25.12, "eta": "53:35:34", "max_grad_norm": 0.8, "loss": 0.6642275452613831, "grad_norm": 1.2083429098129272, "learning_rate": 9.170559079763696e-05} +{"ts": "2025-12-27T12:07:55", "event": "train_log", "step": 3574, "epoch": 1.5080168776371308, "progress_pct": 25.13, "epoch_pct": 25.13, "eta": "53:34:11", "max_grad_norm": 0.8, "loss": 0.7441924214363098, "grad_norm": 1.134020209312439, "learning_rate": 9.169233963375552e-05} +{"ts": "2025-12-27T12:08:14", "event": "train_log", "step": 3576, "epoch": 1.508860759493671, "progress_pct": 25.15, "epoch_pct": 25.15, "eta": "53:32:44", "max_grad_norm": 0.8, "loss": 0.6435995101928711, "grad_norm": 1.8178621530532837, "learning_rate": 9.167907885225204e-05} +{"ts": "2025-12-27T12:08:34", "event": "train_log", "step": 3578, "epoch": 1.5097046413502109, "progress_pct": 25.16, "epoch_pct": 25.16, "eta": "53:31:17", "max_grad_norm": 0.8, "loss": 0.6933603882789612, "grad_norm": 1.3850326538085938, "learning_rate": 9.166580845618553e-05} +{"ts": "2025-12-27T12:08:55", "event": "train_log", "step": 3580, "epoch": 1.510548523206751, "progress_pct": 25.18, "epoch_pct": 25.18, "eta": "53:29:56", "max_grad_norm": 0.8, "loss": 0.6686714887619019, "grad_norm": 1.2500641345977783, "learning_rate": 9.165252844861723e-05} +{"ts": "2025-12-27T12:09:15", "event": "train_log", "step": 3582, "epoch": 1.5113924050632912, "progress_pct": 25.19, "epoch_pct": 25.19, "eta": "53:28:31", "max_grad_norm": 0.8, "loss": 0.607890248298645, "grad_norm": 1.0226643085479736, "learning_rate": 9.163923883261056e-05} +{"ts": "2025-12-27T12:09:33", "event": "train_log", "step": 3584, "epoch": 1.5122362869198311, "progress_pct": 25.2, "epoch_pct": 25.2, "eta": "53:27:04", "max_grad_norm": 0.8, "loss": 0.6604583859443665, "grad_norm": 1.233402132987976, "learning_rate": 9.162593961123118e-05} +{"ts": "2025-12-27T12:09:52", "event": "train_log", "step": 3586, "epoch": 1.5130801687763713, "progress_pct": 25.22, "epoch_pct": 25.22, "eta": "53:25:35", "max_grad_norm": 0.8, "loss": 0.6756428480148315, "grad_norm": 1.2609056234359741, "learning_rate": 9.161263078754698e-05} +{"ts": "2025-12-27T12:10:12", "event": "train_log", "step": 3588, "epoch": 1.5139240506329115, "progress_pct": 25.23, "epoch_pct": 25.23, "eta": "53:24:13", "max_grad_norm": 0.8, "loss": 0.6990940570831299, "grad_norm": 1.22673761844635, "learning_rate": 9.159931236462805e-05} +{"ts": "2025-12-27T12:10:32", "event": "train_log", "step": 3590, "epoch": 1.5147679324894514, "progress_pct": 25.25, "epoch_pct": 25.25, "eta": "53:22:49", "max_grad_norm": 0.8, "loss": 0.6436648964881897, "grad_norm": 1.1386182308197021, "learning_rate": 9.158598434554668e-05} +{"ts": "2025-12-27T12:10:53", "event": "train_log", "step": 3592, "epoch": 1.5156118143459916, "progress_pct": 25.26, "epoch_pct": 25.26, "eta": "53:21:26", "max_grad_norm": 0.8, "loss": 0.6420145034790039, "grad_norm": 1.1136831045150757, "learning_rate": 9.157264673337739e-05} +{"ts": "2025-12-27T12:11:13", "event": "train_log", "step": 3594, "epoch": 1.5164556962025317, "progress_pct": 25.27, "epoch_pct": 25.27, "eta": "53:20:03", "max_grad_norm": 0.8, "loss": 0.6518592834472656, "grad_norm": 1.1957908868789673, "learning_rate": 9.155929953119693e-05} +{"ts": "2025-12-27T12:11:31", "event": "train_log", "step": 3596, "epoch": 1.5172995780590717, "progress_pct": 25.29, "epoch_pct": 25.29, "eta": "53:18:34", "max_grad_norm": 0.8, "loss": 0.6891129612922668, "grad_norm": 1.1049647331237793, "learning_rate": 9.154594274208422e-05} +{"ts": "2025-12-27T12:11:51", "event": "train_log", "step": 3598, "epoch": 1.5181434599156118, "progress_pct": 25.3, "epoch_pct": 25.3, "eta": "53:17:10", "max_grad_norm": 0.8, "loss": 0.6945107579231262, "grad_norm": 1.243675947189331, "learning_rate": 9.153257636912043e-05} +{"ts": "2025-12-27T12:12:11", "event": "train_log", "step": 3600, "epoch": 1.518987341772152, "progress_pct": 25.32, "epoch_pct": 25.32, "eta": "53:15:45", "max_grad_norm": 0.8, "loss": 0.7011660933494568, "grad_norm": 1.2633713483810425, "learning_rate": 9.15192004153889e-05} +{"ts": "2025-12-27T12:26:22", "event": "train_log", "step": 3600, "epoch": 1.518987341772152, "progress_pct": 25.32, "epoch_pct": 25.32, "eta": "53:57:36", "max_grad_norm": 0.8, "eval_loss": 0.7118256688117981, "eval_runtime": 851.3079, "eval_samples_per_second": 2.475, "eval_steps_per_second": 2.475} +{"ts": "2025-12-27T12:26:43", "event": "train_log", "step": 3602, "epoch": 1.519831223628692, "progress_pct": 25.33, "epoch_pct": 25.33, "eta": "53:56:14", "max_grad_norm": 0.8, "loss": 0.6843758821487427, "grad_norm": 1.2995525598526, "learning_rate": 9.150581488397525e-05} +{"ts": "2025-12-27T12:27:02", "event": "train_log", "step": 3604, "epoch": 1.520675105485232, "progress_pct": 25.34, "epoch_pct": 25.34, "eta": "53:54:46", "max_grad_norm": 0.8, "loss": 0.6699353456497192, "grad_norm": 1.3140910863876343, "learning_rate": 9.149241977796723e-05} +{"ts": "2025-12-27T12:27:22", "event": "train_log", "step": 3606, "epoch": 1.5215189873417723, "progress_pct": 25.36, "epoch_pct": 25.36, "eta": "53:53:19", "max_grad_norm": 0.8, "loss": 0.7269271612167358, "grad_norm": 1.2674909830093384, "learning_rate": 9.147901510045485e-05} +{"ts": "2025-12-27T12:27:44", "event": "train_log", "step": 3608, "epoch": 1.5223628691983122, "progress_pct": 25.37, "epoch_pct": 25.37, "eta": "53:52:00", "max_grad_norm": 0.8, "loss": 0.5556837916374207, "grad_norm": 1.0232038497924805, "learning_rate": 9.146560085453031e-05} +{"ts": "2025-12-27T12:28:03", "event": "train_log", "step": 3610, "epoch": 1.5232067510548524, "progress_pct": 25.39, "epoch_pct": 25.39, "eta": "53:50:33", "max_grad_norm": 0.8, "loss": 0.7273092269897461, "grad_norm": 1.2598992586135864, "learning_rate": 9.1452177043288e-05} +{"ts": "2025-12-27T12:28:23", "event": "train_log", "step": 3612, "epoch": 1.5240506329113925, "progress_pct": 25.4, "epoch_pct": 25.4, "eta": "53:49:06", "max_grad_norm": 0.8, "loss": 0.6897470355033875, "grad_norm": 1.2002917528152466, "learning_rate": 9.143874366982455e-05} +{"ts": "2025-12-27T12:28:44", "event": "train_log", "step": 3614, "epoch": 1.5248945147679325, "progress_pct": 25.41, "epoch_pct": 25.41, "eta": "53:47:46", "max_grad_norm": 0.8, "loss": 0.6060715913772583, "grad_norm": 1.0959099531173706, "learning_rate": 9.142530073723878e-05} +{"ts": "2025-12-27T12:29:05", "event": "train_log", "step": 3616, "epoch": 1.5257383966244724, "progress_pct": 25.43, "epoch_pct": 25.43, "eta": "53:46:23", "max_grad_norm": 0.8, "loss": 0.6585046052932739, "grad_norm": 1.9890750646591187, "learning_rate": 9.141184824863173e-05} +{"ts": "2025-12-27T12:29:25", "event": "train_log", "step": 3618, "epoch": 1.5265822784810128, "progress_pct": 25.44, "epoch_pct": 25.44, "eta": "53:44:59", "max_grad_norm": 0.8, "loss": 0.6022046804428101, "grad_norm": 1.1460137367248535, "learning_rate": 9.139838620710663e-05} +{"ts": "2025-12-27T12:29:45", "event": "train_log", "step": 3620, "epoch": 1.5274261603375527, "progress_pct": 25.46, "epoch_pct": 25.46, "eta": "53:43:32", "max_grad_norm": 0.8, "loss": 0.6332581639289856, "grad_norm": 1.193206548690796, "learning_rate": 9.138491461576888e-05} +{"ts": "2025-12-27T12:30:04", "event": "train_log", "step": 3622, "epoch": 1.5282700421940927, "progress_pct": 25.47, "epoch_pct": 25.47, "eta": "53:42:04", "max_grad_norm": 0.8, "loss": 0.6690208315849304, "grad_norm": 1.2813689708709717, "learning_rate": 9.137143347772614e-05} +{"ts": "2025-12-27T12:30:24", "event": "train_log", "step": 3624, "epoch": 1.529113924050633, "progress_pct": 25.49, "epoch_pct": 25.49, "eta": "53:40:39", "max_grad_norm": 0.8, "loss": 0.6034293174743652, "grad_norm": 1.0950052738189697, "learning_rate": 9.135794279608827e-05} +{"ts": "2025-12-27T12:30:42", "event": "train_log", "step": 3626, "epoch": 1.529957805907173, "progress_pct": 25.5, "epoch_pct": 25.5, "eta": "53:39:10", "max_grad_norm": 0.8, "loss": 0.7077960968017578, "grad_norm": 1.208884358406067, "learning_rate": 9.134444257396729e-05} +{"ts": "2025-12-27T12:31:03", "event": "train_log", "step": 3628, "epoch": 1.530801687763713, "progress_pct": 25.51, "epoch_pct": 25.51, "eta": "53:37:49", "max_grad_norm": 0.8, "loss": 0.6741147637367249, "grad_norm": 1.093759298324585, "learning_rate": 9.133093281447742e-05} +{"ts": "2025-12-27T12:31:22", "event": "train_log", "step": 3630, "epoch": 1.5316455696202531, "progress_pct": 25.53, "epoch_pct": 25.53, "eta": "53:36:20", "max_grad_norm": 0.8, "loss": 0.6816818118095398, "grad_norm": 1.1280012130737305, "learning_rate": 9.131741352073514e-05} +{"ts": "2025-12-27T12:31:40", "event": "train_log", "step": 3632, "epoch": 1.5324894514767933, "progress_pct": 25.54, "epoch_pct": 25.54, "eta": "53:34:50", "max_grad_norm": 0.8, "loss": 0.7149180769920349, "grad_norm": 1.2868385314941406, "learning_rate": 9.130388469585907e-05} +{"ts": "2025-12-27T12:32:00", "event": "train_log", "step": 3634, "epoch": 1.5333333333333332, "progress_pct": 25.56, "epoch_pct": 25.56, "eta": "53:33:27", "max_grad_norm": 0.8, "loss": 0.613467812538147, "grad_norm": 0.9654553532600403, "learning_rate": 9.129034634297007e-05} +{"ts": "2025-12-27T12:32:21", "event": "train_log", "step": 3636, "epoch": 1.5341772151898734, "progress_pct": 25.57, "epoch_pct": 25.57, "eta": "53:32:07", "max_grad_norm": 0.8, "loss": 0.7034116387367249, "grad_norm": 1.8958736658096313, "learning_rate": 9.127679846519115e-05} +{"ts": "2025-12-27T12:32:40", "event": "train_log", "step": 3638, "epoch": 1.5350210970464135, "progress_pct": 25.58, "epoch_pct": 25.58, "eta": "53:30:38", "max_grad_norm": 0.8, "loss": 0.7076106667518616, "grad_norm": 1.305284857749939, "learning_rate": 9.126324106564757e-05} +{"ts": "2025-12-27T12:33:00", "event": "train_log", "step": 3640, "epoch": 1.5358649789029535, "progress_pct": 25.6, "epoch_pct": 25.6, "eta": "53:29:14", "max_grad_norm": 0.8, "loss": 0.6671180725097656, "grad_norm": 1.1843762397766113, "learning_rate": 9.124967414746675e-05} +{"ts": "2025-12-27T12:33:21", "event": "train_log", "step": 3642, "epoch": 1.5367088607594936, "progress_pct": 25.61, "epoch_pct": 25.61, "eta": "53:27:53", "max_grad_norm": 0.8, "loss": 0.667533814907074, "grad_norm": 1.0460047721862793, "learning_rate": 9.123609771377832e-05} +{"ts": "2025-12-27T12:33:41", "event": "train_log", "step": 3644, "epoch": 1.5375527426160338, "progress_pct": 25.63, "epoch_pct": 25.63, "eta": "53:26:30", "max_grad_norm": 0.8, "loss": 0.6454499959945679, "grad_norm": 1.0441135168075562, "learning_rate": 9.122251176771409e-05} +{"ts": "2025-12-27T12:34:01", "event": "train_log", "step": 3646, "epoch": 1.5383966244725737, "progress_pct": 25.64, "epoch_pct": 25.64, "eta": "53:25:04", "max_grad_norm": 0.8, "loss": 0.677007794380188, "grad_norm": 1.5647634267807007, "learning_rate": 9.120891631240811e-05} +{"ts": "2025-12-27T12:34:21", "event": "train_log", "step": 3648, "epoch": 1.539240506329114, "progress_pct": 25.65, "epoch_pct": 25.65, "eta": "53:23:43", "max_grad_norm": 0.8, "loss": 0.7017449736595154, "grad_norm": 1.0650273561477661, "learning_rate": 9.119531135099655e-05} +{"ts": "2025-12-27T12:34:40", "event": "train_log", "step": 3650, "epoch": 1.540084388185654, "progress_pct": 25.67, "epoch_pct": 25.67, "eta": "53:22:15", "max_grad_norm": 0.8, "loss": 0.683830738067627, "grad_norm": 1.2904767990112305, "learning_rate": 9.118169688661784e-05} +{"ts": "2025-12-27T12:35:00", "event": "train_log", "step": 3652, "epoch": 1.540928270042194, "progress_pct": 25.68, "epoch_pct": 25.68, "eta": "53:20:52", "max_grad_norm": 0.8, "loss": 0.5923286080360413, "grad_norm": 1.1278672218322754, "learning_rate": 9.116807292241257e-05} +{"ts": "2025-12-27T12:35:21", "event": "train_log", "step": 3654, "epoch": 1.5417721518987342, "progress_pct": 25.7, "epoch_pct": 25.7, "eta": "53:19:30", "max_grad_norm": 0.8, "loss": 0.6595140099525452, "grad_norm": 1.1107184886932373, "learning_rate": 9.115443946152352e-05} +{"ts": "2025-12-27T12:35:40", "event": "train_log", "step": 3656, "epoch": 1.5426160337552743, "progress_pct": 25.71, "epoch_pct": 25.71, "eta": "53:18:05", "max_grad_norm": 0.8, "loss": 0.655241072177887, "grad_norm": 1.0917898416519165, "learning_rate": 9.114079650709566e-05} +{"ts": "2025-12-27T12:36:00", "event": "train_log", "step": 3658, "epoch": 1.5434599156118143, "progress_pct": 25.72, "epoch_pct": 25.72, "eta": "53:16:39", "max_grad_norm": 0.8, "loss": 0.5987096428871155, "grad_norm": 1.1922433376312256, "learning_rate": 9.11271440622762e-05} +{"ts": "2025-12-27T12:36:21", "event": "train_log", "step": 3660, "epoch": 1.5443037974683544, "progress_pct": 25.74, "epoch_pct": 25.74, "eta": "53:15:20", "max_grad_norm": 0.8, "loss": 0.5710145235061646, "grad_norm": 0.9974617958068848, "learning_rate": 9.111348213021445e-05} +{"ts": "2025-12-27T12:36:40", "event": "train_log", "step": 3662, "epoch": 1.5451476793248946, "progress_pct": 25.75, "epoch_pct": 25.75, "eta": "53:13:54", "max_grad_norm": 0.8, "loss": 0.6067734360694885, "grad_norm": 1.133683443069458, "learning_rate": 9.109981071406197e-05} +{"ts": "2025-12-27T12:37:00", "event": "train_log", "step": 3664, "epoch": 1.5459915611814345, "progress_pct": 25.77, "epoch_pct": 25.77, "eta": "53:12:29", "max_grad_norm": 0.8, "loss": 0.622981071472168, "grad_norm": 1.1958736181259155, "learning_rate": 9.108612981697248e-05} +{"ts": "2025-12-27T12:37:20", "event": "train_log", "step": 3666, "epoch": 1.5468354430379747, "progress_pct": 25.78, "epoch_pct": 25.78, "eta": "53:11:07", "max_grad_norm": 0.8, "loss": 0.6520710587501526, "grad_norm": 1.234328031539917, "learning_rate": 9.107243944210194e-05} +{"ts": "2025-12-27T12:37:41", "event": "train_log", "step": 3668, "epoch": 1.5476793248945149, "progress_pct": 25.79, "epoch_pct": 25.79, "eta": "53:09:47", "max_grad_norm": 0.8, "loss": 0.5993341207504272, "grad_norm": 1.0374714136123657, "learning_rate": 9.105873959260842e-05} +{"ts": "2025-12-27T12:38:02", "event": "train_log", "step": 3670, "epoch": 1.5485232067510548, "progress_pct": 25.81, "epoch_pct": 25.81, "eta": "53:08:27", "max_grad_norm": 0.8, "loss": 0.6564813852310181, "grad_norm": 0.9987428784370422, "learning_rate": 9.104503027165223e-05} +{"ts": "2025-12-27T12:38:21", "event": "train_log", "step": 3672, "epoch": 1.549367088607595, "progress_pct": 25.82, "epoch_pct": 25.82, "eta": "53:07:01", "max_grad_norm": 0.8, "loss": 0.61710524559021, "grad_norm": 1.0823339223861694, "learning_rate": 9.103131148239584e-05} +{"ts": "2025-12-27T12:38:39", "event": "train_log", "step": 3674, "epoch": 1.5502109704641351, "progress_pct": 25.84, "epoch_pct": 25.84, "eta": "53:05:32", "max_grad_norm": 0.8, "loss": 0.687752366065979, "grad_norm": 1.3481065034866333, "learning_rate": 9.101758322800391e-05} +{"ts": "2025-12-27T12:38:58", "event": "train_log", "step": 3676, "epoch": 1.551054852320675, "progress_pct": 25.85, "epoch_pct": 25.85, "eta": "53:04:07", "max_grad_norm": 0.8, "loss": 0.5981095433235168, "grad_norm": 1.2243965864181519, "learning_rate": 9.10038455116433e-05} +{"ts": "2025-12-27T12:39:17", "event": "train_log", "step": 3678, "epoch": 1.5518987341772152, "progress_pct": 25.86, "epoch_pct": 25.86, "eta": "53:02:41", "max_grad_norm": 0.8, "loss": 0.7181004285812378, "grad_norm": 1.1384631395339966, "learning_rate": 9.0990098336483e-05} +{"ts": "2025-12-27T12:39:36", "event": "train_log", "step": 3680, "epoch": 1.5527426160337554, "progress_pct": 25.88, "epoch_pct": 25.88, "eta": "53:01:14", "max_grad_norm": 0.8, "loss": 0.6137188076972961, "grad_norm": 1.042925477027893, "learning_rate": 9.097634170569426e-05} +{"ts": "2025-12-27T12:39:54", "event": "train_log", "step": 3682, "epoch": 1.5535864978902953, "progress_pct": 25.89, "epoch_pct": 25.89, "eta": "52:59:46", "max_grad_norm": 0.8, "loss": 0.6761168241500854, "grad_norm": 1.372023105621338, "learning_rate": 9.096257562245045e-05} +{"ts": "2025-12-27T12:40:13", "event": "train_log", "step": 3684, "epoch": 1.5544303797468353, "progress_pct": 25.91, "epoch_pct": 25.91, "eta": "52:58:22", "max_grad_norm": 0.8, "loss": 0.614276647567749, "grad_norm": 1.0574673414230347, "learning_rate": 9.094880008992714e-05} +{"ts": "2025-12-27T12:40:32", "event": "train_log", "step": 3686, "epoch": 1.5552742616033757, "progress_pct": 25.92, "epoch_pct": 25.92, "eta": "52:56:55", "max_grad_norm": 0.8, "loss": 0.668122410774231, "grad_norm": 1.2894645929336548, "learning_rate": 9.093501511130208e-05} +{"ts": "2025-12-27T12:40:51", "event": "train_log", "step": 3688, "epoch": 1.5561181434599156, "progress_pct": 25.94, "epoch_pct": 25.94, "eta": "52:55:30", "max_grad_norm": 0.8, "loss": 0.6305631399154663, "grad_norm": 1.2241230010986328, "learning_rate": 9.092122068975523e-05} +{"ts": "2025-12-27T12:41:09", "event": "train_log", "step": 3690, "epoch": 1.5569620253164556, "progress_pct": 25.95, "epoch_pct": 25.95, "eta": "52:54:04", "max_grad_norm": 0.8, "loss": 0.633276641368866, "grad_norm": 1.1316208839416504, "learning_rate": 9.090741682846866e-05} +{"ts": "2025-12-27T12:41:29", "event": "train_log", "step": 3692, "epoch": 1.557805907172996, "progress_pct": 25.96, "epoch_pct": 25.96, "eta": "52:52:40", "max_grad_norm": 0.8, "loss": 0.6657599806785583, "grad_norm": 1.2857953310012817, "learning_rate": 9.089360353062666e-05} +{"ts": "2025-12-27T12:41:48", "event": "train_log", "step": 3694, "epoch": 1.5586497890295359, "progress_pct": 25.98, "epoch_pct": 25.98, "eta": "52:51:14", "max_grad_norm": 0.8, "loss": 0.6379332542419434, "grad_norm": 1.2325671911239624, "learning_rate": 9.087978079941573e-05} +{"ts": "2025-12-27T12:42:06", "event": "train_log", "step": 3696, "epoch": 1.5594936708860758, "progress_pct": 25.99, "epoch_pct": 25.99, "eta": "52:49:48", "max_grad_norm": 0.8, "loss": 0.6841909885406494, "grad_norm": 1.3286080360412598, "learning_rate": 9.086594863802445e-05} +{"ts": "2025-12-27T12:42:25", "event": "train_log", "step": 3698, "epoch": 1.560337552742616, "progress_pct": 26.01, "epoch_pct": 26.01, "eta": "52:48:23", "max_grad_norm": 0.8, "loss": 0.6735964417457581, "grad_norm": 1.261890172958374, "learning_rate": 9.085210704964368e-05} +{"ts": "2025-12-27T12:42:44", "event": "train_log", "step": 3700, "epoch": 1.5611814345991561, "progress_pct": 26.02, "epoch_pct": 26.02, "eta": "52:46:58", "max_grad_norm": 0.8, "loss": 0.6602351665496826, "grad_norm": 1.0922305583953857, "learning_rate": 9.083825603746639e-05} +{"ts": "2025-12-27T12:57:01", "event": "train_log", "step": 3700, "epoch": 1.5611814345991561, "progress_pct": 26.02, "epoch_pct": 26.02, "eta": "53:27:35", "max_grad_norm": 0.8, "eval_loss": 0.7099412679672241, "eval_runtime": 857.2273, "eval_samples_per_second": 2.458, "eval_steps_per_second": 2.458} +{"ts": "2025-12-27T12:57:20", "event": "train_log", "step": 3702, "epoch": 1.562025316455696, "progress_pct": 26.03, "epoch_pct": 26.03, "eta": "53:26:09", "max_grad_norm": 0.8, "loss": 0.6590834259986877, "grad_norm": 1.1113468408584595, "learning_rate": 9.082439560468774e-05} +{"ts": "2025-12-27T12:57:40", "event": "train_log", "step": 3704, "epoch": 1.5628691983122363, "progress_pct": 26.05, "epoch_pct": 26.05, "eta": "53:24:44", "max_grad_norm": 0.8, "loss": 0.6397460103034973, "grad_norm": 1.1476659774780273, "learning_rate": 9.081052575450508e-05} +{"ts": "2025-12-27T12:57:58", "event": "train_log", "step": 3706, "epoch": 1.5637130801687764, "progress_pct": 26.06, "epoch_pct": 26.06, "eta": "53:23:16", "max_grad_norm": 0.8, "loss": 0.6337460279464722, "grad_norm": 1.2270452976226807, "learning_rate": 9.07966464901179e-05} +{"ts": "2025-12-27T12:58:18", "event": "train_log", "step": 3708, "epoch": 1.5645569620253164, "progress_pct": 26.08, "epoch_pct": 26.08, "eta": "53:21:52", "max_grad_norm": 0.8, "loss": 0.680374801158905, "grad_norm": 1.233667016029358, "learning_rate": 9.07827578147279e-05} +{"ts": "2025-12-27T12:58:39", "event": "train_log", "step": 3710, "epoch": 1.5654008438818565, "progress_pct": 26.09, "epoch_pct": 26.09, "eta": "53:20:31", "max_grad_norm": 0.8, "loss": 0.6234241724014282, "grad_norm": 1.0761466026306152, "learning_rate": 9.076885973153891e-05} +{"ts": "2025-12-27T12:59:00", "event": "train_log", "step": 3712, "epoch": 1.5662447257383967, "progress_pct": 26.1, "epoch_pct": 26.1, "eta": "53:19:10", "max_grad_norm": 0.8, "loss": 0.6096800565719604, "grad_norm": 0.9219012260437012, "learning_rate": 9.075495224375697e-05} +{"ts": "2025-12-27T12:59:20", "event": "train_log", "step": 3714, "epoch": 1.5670886075949366, "progress_pct": 26.12, "epoch_pct": 26.12, "eta": "53:17:47", "max_grad_norm": 0.8, "loss": 0.649919867515564, "grad_norm": 1.151168942451477, "learning_rate": 9.074103535459026e-05} +{"ts": "2025-12-27T12:59:39", "event": "train_log", "step": 3716, "epoch": 1.5679324894514768, "progress_pct": 26.13, "epoch_pct": 26.13, "eta": "53:16:20", "max_grad_norm": 0.8, "loss": 0.6704574227333069, "grad_norm": 1.1380470991134644, "learning_rate": 9.072710906724914e-05} +{"ts": "2025-12-27T12:59:59", "event": "train_log", "step": 3718, "epoch": 1.568776371308017, "progress_pct": 26.15, "epoch_pct": 26.15, "eta": "53:14:57", "max_grad_norm": 0.8, "loss": 0.6619362831115723, "grad_norm": 1.2184447050094604, "learning_rate": 9.071317338494614e-05} +{"ts": "2025-12-27T13:00:20", "event": "train_log", "step": 3720, "epoch": 1.5696202531645569, "progress_pct": 26.16, "epoch_pct": 26.16, "eta": "53:13:37", "max_grad_norm": 0.8, "loss": 0.6179121732711792, "grad_norm": 1.131170630455017, "learning_rate": 9.069922831089594e-05} +{"ts": "2025-12-27T13:00:41", "event": "train_log", "step": 3722, "epoch": 1.570464135021097, "progress_pct": 26.17, "epoch_pct": 26.17, "eta": "53:12:18", "max_grad_norm": 0.8, "loss": 0.594958484172821, "grad_norm": 1.2668405771255493, "learning_rate": 9.06852738483154e-05} +{"ts": "2025-12-27T13:01:01", "event": "train_log", "step": 3724, "epoch": 1.5713080168776372, "progress_pct": 26.19, "epoch_pct": 26.19, "eta": "53:10:54", "max_grad_norm": 0.8, "loss": 0.6323778629302979, "grad_norm": 1.1624782085418701, "learning_rate": 9.067131000042359e-05} +{"ts": "2025-12-27T13:01:22", "event": "train_log", "step": 3726, "epoch": 1.5721518987341772, "progress_pct": 26.2, "epoch_pct": 26.2, "eta": "53:09:35", "max_grad_norm": 0.8, "loss": 0.628058910369873, "grad_norm": 1.2936128377914429, "learning_rate": 9.065733677044166e-05} +{"ts": "2025-12-27T13:01:42", "event": "train_log", "step": 3728, "epoch": 1.5729957805907173, "progress_pct": 26.22, "epoch_pct": 26.22, "eta": "53:08:12", "max_grad_norm": 0.8, "loss": 0.6472614407539368, "grad_norm": 1.1847784519195557, "learning_rate": 9.064335416159296e-05} +{"ts": "2025-12-27T13:02:02", "event": "train_log", "step": 3730, "epoch": 1.5738396624472575, "progress_pct": 26.23, "epoch_pct": 26.23, "eta": "53:06:49", "max_grad_norm": 0.8, "loss": 0.6395491361618042, "grad_norm": 1.8903449773788452, "learning_rate": 9.062936217710305e-05} +{"ts": "2025-12-27T13:02:21", "event": "train_log", "step": 3732, "epoch": 1.5746835443037974, "progress_pct": 26.24, "epoch_pct": 26.24, "eta": "53:05:24", "max_grad_norm": 0.8, "loss": 0.6911961436271667, "grad_norm": 1.1150785684585571, "learning_rate": 9.061536082019956e-05} +{"ts": "2025-12-27T13:02:40", "event": "train_log", "step": 3734, "epoch": 1.5755274261603376, "progress_pct": 26.26, "epoch_pct": 26.26, "eta": "53:03:59", "max_grad_norm": 0.8, "loss": 0.7051874399185181, "grad_norm": 1.1206107139587402, "learning_rate": 9.060135009411239e-05} +{"ts": "2025-12-27T13:02:58", "event": "train_log", "step": 3736, "epoch": 1.5763713080168777, "progress_pct": 26.27, "epoch_pct": 26.27, "eta": "53:02:29", "max_grad_norm": 0.8, "loss": 0.7012752890586853, "grad_norm": 1.27924382686615, "learning_rate": 9.05873300020735e-05} +{"ts": "2025-12-27T13:03:15", "event": "train_log", "step": 3738, "epoch": 1.5772151898734177, "progress_pct": 26.29, "epoch_pct": 26.29, "eta": "53:01:00", "max_grad_norm": 0.8, "loss": 0.7185142040252686, "grad_norm": 1.3970832824707031, "learning_rate": 9.057330054731707e-05} +{"ts": "2025-12-27T13:03:35", "event": "train_log", "step": 3740, "epoch": 1.5780590717299579, "progress_pct": 26.3, "epoch_pct": 26.3, "eta": "52:59:37", "max_grad_norm": 0.8, "loss": 0.6298858523368835, "grad_norm": 0.9732457995414734, "learning_rate": 9.055926173307945e-05} +{"ts": "2025-12-27T13:03:54", "event": "train_log", "step": 3742, "epoch": 1.578902953586498, "progress_pct": 26.32, "epoch_pct": 26.32, "eta": "52:58:12", "max_grad_norm": 0.8, "loss": 0.7142943739891052, "grad_norm": 1.230928897857666, "learning_rate": 9.054521356259909e-05} +{"ts": "2025-12-27T13:04:14", "event": "train_log", "step": 3744, "epoch": 1.579746835443038, "progress_pct": 26.33, "epoch_pct": 26.33, "eta": "52:56:48", "max_grad_norm": 0.8, "loss": 0.6535376310348511, "grad_norm": 1.1297426223754883, "learning_rate": 9.053115603911664e-05} +{"ts": "2025-12-27T13:04:35", "event": "train_log", "step": 3746, "epoch": 1.580590717299578, "progress_pct": 26.34, "epoch_pct": 26.34, "eta": "52:55:29", "max_grad_norm": 0.8, "loss": 0.6236510872840881, "grad_norm": 1.2132076025009155, "learning_rate": 9.051708916587491e-05} +{"ts": "2025-12-27T13:04:54", "event": "train_log", "step": 3748, "epoch": 1.5814345991561183, "progress_pct": 26.36, "epoch_pct": 26.36, "eta": "52:54:03", "max_grad_norm": 0.8, "loss": 0.6752219200134277, "grad_norm": 1.201319932937622, "learning_rate": 9.050301294611885e-05} +{"ts": "2025-12-27T13:05:12", "event": "train_log", "step": 3750, "epoch": 1.5822784810126582, "progress_pct": 26.37, "epoch_pct": 26.37, "eta": "52:52:37", "max_grad_norm": 0.8, "loss": 0.7248554825782776, "grad_norm": 1.2969163656234741, "learning_rate": 9.048892738309559e-05} +{"ts": "2025-12-27T13:05:31", "event": "train_log", "step": 3752, "epoch": 1.5831223628691982, "progress_pct": 26.39, "epoch_pct": 26.39, "eta": "52:51:12", "max_grad_norm": 0.8, "loss": 0.6488997340202332, "grad_norm": 1.0721957683563232, "learning_rate": 9.047483248005439e-05} +{"ts": "2025-12-27T13:05:52", "event": "train_log", "step": 3754, "epoch": 1.5839662447257385, "progress_pct": 26.4, "epoch_pct": 26.4, "eta": "52:49:52", "max_grad_norm": 0.8, "loss": 0.6191130876541138, "grad_norm": 0.9988508820533752, "learning_rate": 9.046072824024667e-05} +{"ts": "2025-12-27T13:06:10", "event": "train_log", "step": 3756, "epoch": 1.5848101265822785, "progress_pct": 26.41, "epoch_pct": 26.41, "eta": "52:48:24", "max_grad_norm": 0.8, "loss": 0.6681985259056091, "grad_norm": 1.260183572769165, "learning_rate": 9.0446614666926e-05} +{"ts": "2025-12-27T13:06:29", "event": "train_log", "step": 3758, "epoch": 1.5856540084388184, "progress_pct": 26.43, "epoch_pct": 26.43, "eta": "52:47:01", "max_grad_norm": 0.8, "loss": 0.662024736404419, "grad_norm": 1.1288834810256958, "learning_rate": 9.043249176334812e-05} +{"ts": "2025-12-27T13:06:46", "event": "train_log", "step": 3760, "epoch": 1.5864978902953588, "progress_pct": 26.44, "epoch_pct": 26.44, "eta": "52:45:32", "max_grad_norm": 0.8, "loss": 0.609916627407074, "grad_norm": 1.4384263753890991, "learning_rate": 9.04183595327709e-05} +{"ts": "2025-12-27T13:07:06", "event": "train_log", "step": 3762, "epoch": 1.5873417721518988, "progress_pct": 26.46, "epoch_pct": 26.46, "eta": "52:44:08", "max_grad_norm": 0.8, "loss": 0.6532528400421143, "grad_norm": 1.1109941005706787, "learning_rate": 9.04042179784544e-05} +{"ts": "2025-12-27T13:07:24", "event": "train_log", "step": 3764, "epoch": 1.5881856540084387, "progress_pct": 26.47, "epoch_pct": 26.47, "eta": "52:42:42", "max_grad_norm": 0.8, "loss": 0.7136290669441223, "grad_norm": 1.0959233045578003, "learning_rate": 9.039006710366078e-05} +{"ts": "2025-12-27T13:07:43", "event": "train_log", "step": 3766, "epoch": 1.5890295358649789, "progress_pct": 26.48, "epoch_pct": 26.48, "eta": "52:41:17", "max_grad_norm": 0.8, "loss": 0.6907190084457397, "grad_norm": 1.2313964366912842, "learning_rate": 9.037590691165439e-05} +{"ts": "2025-12-27T13:08:02", "event": "train_log", "step": 3768, "epoch": 1.589873417721519, "progress_pct": 26.5, "epoch_pct": 26.5, "eta": "52:39:52", "max_grad_norm": 0.8, "loss": 0.7114790678024292, "grad_norm": 1.3127682209014893, "learning_rate": 9.036173740570172e-05} +{"ts": "2025-12-27T13:08:22", "event": "train_log", "step": 3770, "epoch": 1.590717299578059, "progress_pct": 26.51, "epoch_pct": 26.51, "eta": "52:38:31", "max_grad_norm": 0.8, "loss": 0.6257581114768982, "grad_norm": 1.0038903951644897, "learning_rate": 9.034755858907138e-05} +{"ts": "2025-12-27T13:08:41", "event": "train_log", "step": 3772, "epoch": 1.5915611814345991, "progress_pct": 26.53, "epoch_pct": 26.53, "eta": "52:37:09", "max_grad_norm": 0.8, "loss": 0.578145444393158, "grad_norm": 1.1058061122894287, "learning_rate": 9.033337046503416e-05} +{"ts": "2025-12-27T13:09:01", "event": "train_log", "step": 3774, "epoch": 1.5924050632911393, "progress_pct": 26.54, "epoch_pct": 26.54, "eta": "52:35:47", "max_grad_norm": 0.8, "loss": 0.6312620043754578, "grad_norm": 1.0893515348434448, "learning_rate": 9.0319173036863e-05} +{"ts": "2025-12-27T13:09:21", "event": "train_log", "step": 3776, "epoch": 1.5932489451476792, "progress_pct": 26.55, "epoch_pct": 26.55, "eta": "52:34:27", "max_grad_norm": 0.8, "loss": 0.6799508333206177, "grad_norm": 1.1091047525405884, "learning_rate": 9.030496630783297e-05} +{"ts": "2025-12-27T13:09:41", "event": "train_log", "step": 3778, "epoch": 1.5940928270042194, "progress_pct": 26.57, "epoch_pct": 26.57, "eta": "52:33:04", "max_grad_norm": 0.8, "loss": 0.678726315498352, "grad_norm": 1.1103609800338745, "learning_rate": 9.029075028122127e-05} +{"ts": "2025-12-27T13:10:00", "event": "train_log", "step": 3780, "epoch": 1.5949367088607596, "progress_pct": 26.58, "epoch_pct": 26.58, "eta": "52:31:39", "max_grad_norm": 0.8, "loss": 0.7357890009880066, "grad_norm": 1.1918376684188843, "learning_rate": 9.027652496030728e-05} +{"ts": "2025-12-27T13:10:20", "event": "train_log", "step": 3782, "epoch": 1.5957805907172995, "progress_pct": 26.6, "epoch_pct": 26.6, "eta": "52:30:18", "max_grad_norm": 0.8, "loss": 0.6079391241073608, "grad_norm": 1.0541924238204956, "learning_rate": 9.026229034837253e-05} +{"ts": "2025-12-27T13:10:37", "event": "train_log", "step": 3784, "epoch": 1.5966244725738397, "progress_pct": 26.61, "epoch_pct": 26.61, "eta": "52:28:50", "max_grad_norm": 0.8, "loss": 0.7173702120780945, "grad_norm": 1.195845603942871, "learning_rate": 9.024804644870062e-05} +{"ts": "2025-12-27T13:10:57", "event": "train_log", "step": 3786, "epoch": 1.5974683544303798, "progress_pct": 26.62, "epoch_pct": 26.62, "eta": "52:27:29", "max_grad_norm": 0.8, "loss": 0.6431670188903809, "grad_norm": 1.1362866163253784, "learning_rate": 9.023379326457737e-05} +{"ts": "2025-12-27T13:11:16", "event": "train_log", "step": 3788, "epoch": 1.5983122362869198, "progress_pct": 26.64, "epoch_pct": 26.64, "eta": "52:26:07", "max_grad_norm": 0.8, "loss": 0.6346777677536011, "grad_norm": 1.2327499389648438, "learning_rate": 9.021953079929074e-05} +{"ts": "2025-12-27T13:11:35", "event": "train_log", "step": 3790, "epoch": 1.59915611814346, "progress_pct": 26.65, "epoch_pct": 26.65, "eta": "52:24:43", "max_grad_norm": 0.8, "loss": 0.6852784156799316, "grad_norm": 1.1623177528381348, "learning_rate": 9.020525905613078e-05} +{"ts": "2025-12-27T13:11:57", "event": "train_log", "step": 3792, "epoch": 1.6, "progress_pct": 26.67, "epoch_pct": 26.67, "eta": "52:23:27", "max_grad_norm": 0.8, "loss": 0.6357095241546631, "grad_norm": 1.0258424282073975, "learning_rate": 9.019097803838971e-05} +{"ts": "2025-12-27T13:12:17", "event": "train_log", "step": 3794, "epoch": 1.60084388185654, "progress_pct": 26.68, "epoch_pct": 26.68, "eta": "52:22:05", "max_grad_norm": 0.8, "loss": 0.6663659811019897, "grad_norm": 1.0825177431106567, "learning_rate": 9.017668774936188e-05} +{"ts": "2025-12-27T13:12:35", "event": "train_log", "step": 3796, "epoch": 1.6016877637130802, "progress_pct": 26.69, "epoch_pct": 26.69, "eta": "52:20:39", "max_grad_norm": 0.8, "loss": 0.6009758710861206, "grad_norm": 1.1190401315689087, "learning_rate": 9.016238819234381e-05} +{"ts": "2025-12-27T13:12:54", "event": "train_log", "step": 3798, "epoch": 1.6025316455696204, "progress_pct": 26.71, "epoch_pct": 26.71, "eta": "52:19:16", "max_grad_norm": 0.8, "loss": 0.6907890439033508, "grad_norm": 1.09871244430542, "learning_rate": 9.01480793706341e-05} +{"ts": "2025-12-27T13:13:13", "event": "train_log", "step": 3800, "epoch": 1.6033755274261603, "progress_pct": 26.72, "epoch_pct": 26.72, "eta": "52:17:54", "max_grad_norm": 0.8, "loss": 0.6709389090538025, "grad_norm": 1.2046958208084106, "learning_rate": 9.013376128753354e-05} +{"ts": "2025-12-27T13:27:39", "event": "train_log", "step": 3800, "epoch": 1.6033755274261603, "progress_pct": 26.72, "epoch_pct": 26.72, "eta": "52:57:28", "max_grad_norm": 0.8, "eval_loss": 0.7080941200256348, "eval_runtime": 865.6774, "eval_samples_per_second": 2.434, "eval_steps_per_second": 2.434} +{"ts": "2025-12-27T13:28:00", "event": "train_log", "step": 3802, "epoch": 1.6042194092827005, "progress_pct": 26.74, "epoch_pct": 26.74, "eta": "52:56:08", "max_grad_norm": 0.8, "loss": 0.653937041759491, "grad_norm": 1.0671489238739014, "learning_rate": 9.011943394634505e-05} +{"ts": "2025-12-27T13:28:17", "event": "train_log", "step": 3804, "epoch": 1.6050632911392406, "progress_pct": 26.75, "epoch_pct": 26.75, "eta": "52:54:38", "max_grad_norm": 0.8, "loss": 0.6647229194641113, "grad_norm": 1.4205375909805298, "learning_rate": 9.010509735037364e-05} +{"ts": "2025-12-27T13:28:34", "event": "train_log", "step": 3806, "epoch": 1.6059071729957806, "progress_pct": 26.77, "epoch_pct": 26.77, "eta": "52:53:10", "max_grad_norm": 0.8, "loss": 0.6981267929077148, "grad_norm": 1.3793799877166748, "learning_rate": 9.009075150292652e-05} +{"ts": "2025-12-27T13:28:53", "event": "train_log", "step": 3808, "epoch": 1.6067510548523207, "progress_pct": 26.78, "epoch_pct": 26.78, "eta": "52:51:44", "max_grad_norm": 0.8, "loss": 0.6151314973831177, "grad_norm": 1.0534380674362183, "learning_rate": 9.007639640731298e-05} +{"ts": "2025-12-27T13:29:12", "event": "train_log", "step": 3810, "epoch": 1.6075949367088609, "progress_pct": 26.79, "epoch_pct": 26.79, "eta": "52:50:21", "max_grad_norm": 0.8, "loss": 0.6671237349510193, "grad_norm": 1.1359853744506836, "learning_rate": 9.006203206684447e-05} +{"ts": "2025-12-27T13:29:32", "event": "train_log", "step": 3812, "epoch": 1.6084388185654008, "progress_pct": 26.81, "epoch_pct": 26.81, "eta": "52:48:57", "max_grad_norm": 0.8, "loss": 0.7145646810531616, "grad_norm": 1.2385475635528564, "learning_rate": 9.004765848483456e-05} +{"ts": "2025-12-27T13:29:51", "event": "train_log", "step": 3814, "epoch": 1.6092827004219408, "progress_pct": 26.82, "epoch_pct": 26.82, "eta": "52:47:32", "max_grad_norm": 0.8, "loss": 0.6524789929389954, "grad_norm": 1.1323930025100708, "learning_rate": 9.003327566459899e-05} +{"ts": "2025-12-27T13:30:09", "event": "train_log", "step": 3816, "epoch": 1.6101265822784812, "progress_pct": 26.84, "epoch_pct": 26.84, "eta": "52:46:08", "max_grad_norm": 0.8, "loss": 0.7574670314788818, "grad_norm": 1.1863508224487305, "learning_rate": 9.001888360945555e-05} +{"ts": "2025-12-27T13:30:30", "event": "train_log", "step": 3818, "epoch": 1.610970464135021, "progress_pct": 26.85, "epoch_pct": 26.85, "eta": "52:44:49", "max_grad_norm": 0.8, "loss": 0.5858811736106873, "grad_norm": 1.0288994312286377, "learning_rate": 9.000448232272425e-05} +{"ts": "2025-12-27T13:30:49", "event": "train_log", "step": 3820, "epoch": 1.611814345991561, "progress_pct": 26.86, "epoch_pct": 26.86, "eta": "52:43:25", "max_grad_norm": 0.8, "loss": 0.6834250688552856, "grad_norm": 1.2674148082733154, "learning_rate": 8.999007180772719e-05} +{"ts": "2025-12-27T13:31:09", "event": "train_log", "step": 3822, "epoch": 1.6126582278481014, "progress_pct": 26.88, "epoch_pct": 26.88, "eta": "52:42:02", "max_grad_norm": 0.8, "loss": 0.6435309052467346, "grad_norm": 1.2014318704605103, "learning_rate": 8.997565206778856e-05} +{"ts": "2025-12-27T13:31:29", "event": "train_log", "step": 3824, "epoch": 1.6135021097046414, "progress_pct": 26.89, "epoch_pct": 26.89, "eta": "52:40:40", "max_grad_norm": 0.8, "loss": 0.6212471127510071, "grad_norm": 1.205741286277771, "learning_rate": 8.996122310623476e-05} +{"ts": "2025-12-27T13:31:48", "event": "train_log", "step": 3826, "epoch": 1.6143459915611813, "progress_pct": 26.91, "epoch_pct": 26.91, "eta": "52:39:16", "max_grad_norm": 0.8, "loss": 0.6832143664360046, "grad_norm": 1.0866186618804932, "learning_rate": 8.994678492639426e-05} +{"ts": "2025-12-27T13:32:07", "event": "train_log", "step": 3828, "epoch": 1.6151898734177215, "progress_pct": 26.92, "epoch_pct": 26.92, "eta": "52:37:53", "max_grad_norm": 0.8, "loss": 0.6129988431930542, "grad_norm": 1.0786924362182617, "learning_rate": 8.993233753159768e-05} +{"ts": "2025-12-27T13:32:26", "event": "train_log", "step": 3830, "epoch": 1.6160337552742616, "progress_pct": 26.93, "epoch_pct": 26.93, "eta": "52:36:29", "max_grad_norm": 0.8, "loss": 0.6376019716262817, "grad_norm": 1.176597237586975, "learning_rate": 8.991788092517775e-05} +{"ts": "2025-12-27T13:32:45", "event": "train_log", "step": 3832, "epoch": 1.6168776371308016, "progress_pct": 26.95, "epoch_pct": 26.95, "eta": "52:35:05", "max_grad_norm": 0.8, "loss": 0.7300569415092468, "grad_norm": 1.149990200996399, "learning_rate": 8.99034151104693e-05} +{"ts": "2025-12-27T13:33:04", "event": "train_log", "step": 3834, "epoch": 1.6177215189873417, "progress_pct": 26.96, "epoch_pct": 26.96, "eta": "52:33:41", "max_grad_norm": 0.8, "loss": 0.6163336634635925, "grad_norm": 1.0655301809310913, "learning_rate": 8.988894009080936e-05} +{"ts": "2025-12-27T13:33:23", "event": "train_log", "step": 3836, "epoch": 1.618565400843882, "progress_pct": 26.98, "epoch_pct": 26.98, "eta": "52:32:17", "max_grad_norm": 0.8, "loss": 0.6459008455276489, "grad_norm": 1.1596909761428833, "learning_rate": 8.987445586953703e-05} +{"ts": "2025-12-27T13:33:43", "event": "train_log", "step": 3838, "epoch": 1.6194092827004218, "progress_pct": 26.99, "epoch_pct": 26.99, "eta": "52:30:57", "max_grad_norm": 0.8, "loss": 0.6166399121284485, "grad_norm": 1.201897382736206, "learning_rate": 8.985996244999352e-05} +{"ts": "2025-12-27T13:34:03", "event": "train_log", "step": 3840, "epoch": 1.620253164556962, "progress_pct": 27.0, "epoch_pct": 27.0, "eta": "52:29:36", "max_grad_norm": 0.8, "loss": 0.6438087224960327, "grad_norm": 1.1000950336456299, "learning_rate": 8.984545983552219e-05} +{"ts": "2025-12-27T13:34:23", "event": "train_log", "step": 3842, "epoch": 1.6210970464135022, "progress_pct": 27.02, "epoch_pct": 27.02, "eta": "52:28:15", "max_grad_norm": 0.8, "loss": 0.6238043308258057, "grad_norm": 0.9962409734725952, "learning_rate": 8.983094802946854e-05} +{"ts": "2025-12-27T13:34:42", "event": "train_log", "step": 3844, "epoch": 1.621940928270042, "progress_pct": 27.03, "epoch_pct": 27.03, "eta": "52:26:51", "max_grad_norm": 0.8, "loss": 0.6445946097373962, "grad_norm": 1.2501682043075562, "learning_rate": 8.981642703518015e-05} +{"ts": "2025-12-27T13:35:01", "event": "train_log", "step": 3846, "epoch": 1.6227848101265823, "progress_pct": 27.05, "epoch_pct": 27.05, "eta": "52:25:28", "max_grad_norm": 0.8, "loss": 0.7147613167762756, "grad_norm": 1.2027913331985474, "learning_rate": 8.980189685600673e-05} +{"ts": "2025-12-27T13:35:21", "event": "train_log", "step": 3848, "epoch": 1.6236286919831224, "progress_pct": 27.06, "epoch_pct": 27.06, "eta": "52:24:08", "max_grad_norm": 0.8, "loss": 0.6531714200973511, "grad_norm": 1.1382197141647339, "learning_rate": 8.97873574953001e-05} +{"ts": "2025-12-27T13:35:39", "event": "train_log", "step": 3850, "epoch": 1.6244725738396624, "progress_pct": 27.07, "epoch_pct": 27.07, "eta": "52:22:43", "max_grad_norm": 0.8, "loss": 0.6811055541038513, "grad_norm": 1.2600723505020142, "learning_rate": 8.977280895641425e-05} +{"ts": "2025-12-27T13:35:59", "event": "train_log", "step": 3852, "epoch": 1.6253164556962025, "progress_pct": 27.09, "epoch_pct": 27.09, "eta": "52:21:23", "max_grad_norm": 0.8, "loss": 0.6142261624336243, "grad_norm": 0.9908071160316467, "learning_rate": 8.97582512427052e-05} +{"ts": "2025-12-27T13:36:19", "event": "train_log", "step": 3854, "epoch": 1.6261603375527427, "progress_pct": 27.1, "epoch_pct": 27.1, "eta": "52:20:01", "max_grad_norm": 0.8, "loss": 0.6408987045288086, "grad_norm": 1.171557068824768, "learning_rate": 8.974368435753117e-05} +{"ts": "2025-12-27T13:36:37", "event": "train_log", "step": 3856, "epoch": 1.6270042194092826, "progress_pct": 27.12, "epoch_pct": 27.12, "eta": "52:18:37", "max_grad_norm": 0.8, "loss": 0.7352069616317749, "grad_norm": 1.1839419603347778, "learning_rate": 8.972910830425247e-05} +{"ts": "2025-12-27T13:36:56", "event": "train_log", "step": 3858, "epoch": 1.6278481012658228, "progress_pct": 27.13, "epoch_pct": 27.13, "eta": "52:17:14", "max_grad_norm": 0.8, "loss": 0.7663040161132812, "grad_norm": 1.233730673789978, "learning_rate": 8.971452308623148e-05} +{"ts": "2025-12-27T13:37:16", "event": "train_log", "step": 3860, "epoch": 1.628691983122363, "progress_pct": 27.14, "epoch_pct": 27.14, "eta": "52:15:53", "max_grad_norm": 0.8, "loss": 0.6496971249580383, "grad_norm": 1.3636224269866943, "learning_rate": 8.969992870683273e-05} +{"ts": "2025-12-27T13:37:37", "event": "train_log", "step": 3862, "epoch": 1.629535864978903, "progress_pct": 27.16, "epoch_pct": 27.16, "eta": "52:14:35", "max_grad_norm": 0.8, "loss": 0.6079609394073486, "grad_norm": 1.2819573879241943, "learning_rate": 8.96853251694229e-05} +{"ts": "2025-12-27T13:37:57", "event": "train_log", "step": 3864, "epoch": 1.630379746835443, "progress_pct": 27.17, "epoch_pct": 27.17, "eta": "52:13:16", "max_grad_norm": 0.8, "loss": 0.6299422979354858, "grad_norm": 1.087265968322754, "learning_rate": 8.967071247737071e-05} +{"ts": "2025-12-27T13:38:19", "event": "train_log", "step": 3866, "epoch": 1.6312236286919832, "progress_pct": 27.19, "epoch_pct": 27.19, "eta": "52:12:01", "max_grad_norm": 0.8, "loss": 0.6691840291023254, "grad_norm": 1.24200439453125, "learning_rate": 8.965609063404706e-05} +{"ts": "2025-12-27T13:38:39", "event": "train_log", "step": 3868, "epoch": 1.6320675105485232, "progress_pct": 27.2, "epoch_pct": 27.2, "eta": "52:10:41", "max_grad_norm": 0.8, "loss": 0.6623613238334656, "grad_norm": 1.0771806240081787, "learning_rate": 8.96414596428249e-05} +{"ts": "2025-12-27T13:38:59", "event": "train_log", "step": 3870, "epoch": 1.6329113924050633, "progress_pct": 27.22, "epoch_pct": 27.22, "eta": "52:09:22", "max_grad_norm": 0.8, "loss": 0.6663276553153992, "grad_norm": 1.1830974817276, "learning_rate": 8.962681950707932e-05} +{"ts": "2025-12-27T13:39:20", "event": "train_log", "step": 3872, "epoch": 1.6337552742616035, "progress_pct": 27.23, "epoch_pct": 27.23, "eta": "52:08:04", "max_grad_norm": 0.8, "loss": 0.6426810622215271, "grad_norm": 1.1107177734375, "learning_rate": 8.961217023018754e-05} +{"ts": "2025-12-27T13:39:38", "event": "train_log", "step": 3874, "epoch": 1.6345991561181434, "progress_pct": 27.24, "epoch_pct": 27.24, "eta": "52:06:40", "max_grad_norm": 0.8, "loss": 0.7113696336746216, "grad_norm": 1.2528507709503174, "learning_rate": 8.959751181552886e-05} +{"ts": "2025-12-27T13:40:00", "event": "train_log", "step": 3876, "epoch": 1.6354430379746834, "progress_pct": 27.26, "epoch_pct": 27.26, "eta": "52:05:24", "max_grad_norm": 0.8, "loss": 0.6211581230163574, "grad_norm": 1.0656070709228516, "learning_rate": 8.958284426648467e-05} +{"ts": "2025-12-27T13:40:22", "event": "train_log", "step": 3878, "epoch": 1.6362869198312238, "progress_pct": 27.27, "epoch_pct": 27.27, "eta": "52:04:11", "max_grad_norm": 0.8, "loss": 0.5950066447257996, "grad_norm": 1.0627381801605225, "learning_rate": 8.956816758643852e-05} +{"ts": "2025-12-27T13:40:43", "event": "train_log", "step": 3880, "epoch": 1.6371308016877637, "progress_pct": 27.29, "epoch_pct": 27.29, "eta": "52:02:54", "max_grad_norm": 0.8, "loss": 0.6519815325737, "grad_norm": 0.9812912344932556, "learning_rate": 8.955348177877603e-05} +{"ts": "2025-12-27T13:41:04", "event": "train_log", "step": 3882, "epoch": 1.6379746835443036, "progress_pct": 27.3, "epoch_pct": 27.3, "eta": "52:01:36", "max_grad_norm": 0.8, "loss": 0.6830767393112183, "grad_norm": 1.1843842267990112, "learning_rate": 8.953878684688493e-05} +{"ts": "2025-12-27T13:41:23", "event": "train_log", "step": 3884, "epoch": 1.638818565400844, "progress_pct": 27.31, "epoch_pct": 27.31, "eta": "52:00:16", "max_grad_norm": 0.8, "loss": 0.5920302271842957, "grad_norm": 1.0393236875534058, "learning_rate": 8.952408279415507e-05} +{"ts": "2025-12-27T13:41:44", "event": "train_log", "step": 3886, "epoch": 1.639662447257384, "progress_pct": 27.33, "epoch_pct": 27.33, "eta": "51:58:57", "max_grad_norm": 0.8, "loss": 0.6269177198410034, "grad_norm": 0.9931944608688354, "learning_rate": 8.950936962397838e-05} +{"ts": "2025-12-27T13:42:03", "event": "train_log", "step": 3888, "epoch": 1.640506329113924, "progress_pct": 27.34, "epoch_pct": 27.34, "eta": "51:57:37", "max_grad_norm": 0.8, "loss": 0.7021532654762268, "grad_norm": 1.1461358070373535, "learning_rate": 8.949464733974891e-05} +{"ts": "2025-12-27T13:42:22", "event": "train_log", "step": 3890, "epoch": 1.6413502109704643, "progress_pct": 27.36, "epoch_pct": 27.36, "eta": "51:56:15", "max_grad_norm": 0.8, "loss": 0.7331246733665466, "grad_norm": 1.2654093503952026, "learning_rate": 8.947991594486279e-05} +{"ts": "2025-12-27T13:42:43", "event": "train_log", "step": 3892, "epoch": 1.6421940928270042, "progress_pct": 27.37, "epoch_pct": 27.37, "eta": "51:54:57", "max_grad_norm": 0.8, "loss": 0.6438513994216919, "grad_norm": 1.1487081050872803, "learning_rate": 8.946517544271831e-05} +{"ts": "2025-12-27T13:43:04", "event": "train_log", "step": 3894, "epoch": 1.6430379746835442, "progress_pct": 27.38, "epoch_pct": 27.38, "eta": "51:53:41", "max_grad_norm": 0.8, "loss": 0.6779276728630066, "grad_norm": 1.0876784324645996, "learning_rate": 8.945042583671579e-05} +{"ts": "2025-12-27T13:43:25", "event": "train_log", "step": 3896, "epoch": 1.6438818565400843, "progress_pct": 27.4, "epoch_pct": 27.4, "eta": "51:52:25", "max_grad_norm": 0.8, "loss": 0.7255419492721558, "grad_norm": 1.2382020950317383, "learning_rate": 8.943566713025768e-05} +{"ts": "2025-12-27T13:43:44", "event": "train_log", "step": 3898, "epoch": 1.6447257383966245, "progress_pct": 27.41, "epoch_pct": 27.41, "eta": "51:51:03", "max_grad_norm": 0.8, "loss": 0.7068934440612793, "grad_norm": 1.3502718210220337, "learning_rate": 8.942089932674855e-05} +{"ts": "2025-12-27T13:44:05", "event": "train_log", "step": 3900, "epoch": 1.6455696202531644, "progress_pct": 27.43, "epoch_pct": 27.43, "eta": "51:49:46", "max_grad_norm": 0.8, "loss": 0.608700156211853, "grad_norm": 1.050878643989563, "learning_rate": 8.940612242959503e-05} +{"ts": "2025-12-27T13:58:20", "event": "train_log", "step": 3900, "epoch": 1.6455696202531644, "progress_pct": 27.43, "epoch_pct": 27.43, "eta": "52:27:28", "max_grad_norm": 0.8, "eval_loss": 0.7049403786659241, "eval_runtime": 854.9866, "eval_samples_per_second": 2.464, "eval_steps_per_second": 2.464} +{"ts": "2025-12-27T13:58:39", "event": "train_log", "step": 3902, "epoch": 1.6464135021097046, "progress_pct": 27.44, "epoch_pct": 27.44, "eta": "52:26:06", "max_grad_norm": 0.8, "loss": 0.6257222890853882, "grad_norm": 1.0536954402923584, "learning_rate": 8.939133644220588e-05} +{"ts": "2025-12-27T13:58:58", "event": "train_log", "step": 3904, "epoch": 1.6472573839662448, "progress_pct": 27.45, "epoch_pct": 27.45, "eta": "52:24:43", "max_grad_norm": 0.8, "loss": 0.6823404431343079, "grad_norm": 1.1903947591781616, "learning_rate": 8.937654136799195e-05} +{"ts": "2025-12-27T13:59:17", "event": "train_log", "step": 3906, "epoch": 1.6481012658227847, "progress_pct": 27.47, "epoch_pct": 27.47, "eta": "52:23:20", "max_grad_norm": 0.8, "loss": 0.6596478819847107, "grad_norm": 1.225679874420166, "learning_rate": 8.936173721036616e-05} +{"ts": "2025-12-27T13:59:37", "event": "train_log", "step": 3908, "epoch": 1.6489451476793249, "progress_pct": 27.48, "epoch_pct": 27.48, "eta": "52:22:00", "max_grad_norm": 0.8, "loss": 0.5638422966003418, "grad_norm": 1.0071430206298828, "learning_rate": 8.934692397274354e-05} +{"ts": "2025-12-27T13:59:57", "event": "train_log", "step": 3910, "epoch": 1.649789029535865, "progress_pct": 27.5, "epoch_pct": 27.5, "eta": "52:20:40", "max_grad_norm": 0.8, "loss": 0.5743419528007507, "grad_norm": 1.0146223306655884, "learning_rate": 8.933210165854125e-05} +{"ts": "2025-12-27T14:00:17", "event": "train_log", "step": 3912, "epoch": 1.650632911392405, "progress_pct": 27.51, "epoch_pct": 27.51, "eta": "52:19:19", "max_grad_norm": 0.8, "loss": 0.6775169372558594, "grad_norm": 1.122976541519165, "learning_rate": 8.931727027117848e-05} +{"ts": "2025-12-27T14:00:38", "event": "train_log", "step": 3914, "epoch": 1.6514767932489451, "progress_pct": 27.52, "epoch_pct": 27.52, "eta": "52:18:03", "max_grad_norm": 0.8, "loss": 0.5984215140342712, "grad_norm": 0.9223271012306213, "learning_rate": 8.930242981407656e-05} +{"ts": "2025-12-27T14:00:58", "event": "train_log", "step": 3916, "epoch": 1.6523206751054853, "progress_pct": 27.54, "epoch_pct": 27.54, "eta": "52:16:43", "max_grad_norm": 0.8, "loss": 0.6342158913612366, "grad_norm": 1.1599735021591187, "learning_rate": 8.928758029065891e-05} +{"ts": "2025-12-27T14:01:17", "event": "train_log", "step": 3918, "epoch": 1.6531645569620252, "progress_pct": 27.55, "epoch_pct": 27.55, "eta": "52:15:18", "max_grad_norm": 0.8, "loss": 0.678507924079895, "grad_norm": 1.2680121660232544, "learning_rate": 8.927272170435101e-05} +{"ts": "2025-12-27T14:01:35", "event": "train_log", "step": 3920, "epoch": 1.6540084388185654, "progress_pct": 27.57, "epoch_pct": 27.57, "eta": "52:13:55", "max_grad_norm": 0.8, "loss": 0.6739710569381714, "grad_norm": 1.3628549575805664, "learning_rate": 8.925785405858047e-05} +{"ts": "2025-12-27T14:01:56", "event": "train_log", "step": 3922, "epoch": 1.6548523206751056, "progress_pct": 27.58, "epoch_pct": 27.58, "eta": "52:12:36", "max_grad_norm": 0.8, "loss": 0.7050020098686218, "grad_norm": 1.163482427597046, "learning_rate": 8.924297735677694e-05} +{"ts": "2025-12-27T14:02:17", "event": "train_log", "step": 3924, "epoch": 1.6556962025316455, "progress_pct": 27.59, "epoch_pct": 27.59, "eta": "52:11:19", "max_grad_norm": 0.8, "loss": 0.6847540140151978, "grad_norm": 1.2057000398635864, "learning_rate": 8.922809160237222e-05} +{"ts": "2025-12-27T14:02:35", "event": "train_log", "step": 3926, "epoch": 1.6565400843881857, "progress_pct": 27.61, "epoch_pct": 27.61, "eta": "52:09:54", "max_grad_norm": 0.8, "loss": 0.7079069018363953, "grad_norm": 1.2784082889556885, "learning_rate": 8.921319679880016e-05} +{"ts": "2025-12-27T14:02:55", "event": "train_log", "step": 3928, "epoch": 1.6573839662447258, "progress_pct": 27.62, "epoch_pct": 27.62, "eta": "52:08:34", "max_grad_norm": 0.8, "loss": 0.665060818195343, "grad_norm": 1.1701157093048096, "learning_rate": 8.919829294949671e-05} +{"ts": "2025-12-27T14:03:13", "event": "train_log", "step": 3930, "epoch": 1.6582278481012658, "progress_pct": 27.64, "epoch_pct": 27.64, "eta": "52:07:09", "max_grad_norm": 0.8, "loss": 0.7547550201416016, "grad_norm": 1.3886606693267822, "learning_rate": 8.918338005789988e-05} +{"ts": "2025-12-27T14:03:35", "event": "train_log", "step": 3932, "epoch": 1.659071729957806, "progress_pct": 27.65, "epoch_pct": 27.65, "eta": "52:05:54", "max_grad_norm": 0.8, "loss": 0.5718522667884827, "grad_norm": 0.9504727721214294, "learning_rate": 8.91684581274498e-05} +{"ts": "2025-12-27T14:03:54", "event": "train_log", "step": 3934, "epoch": 1.659915611814346, "progress_pct": 27.67, "epoch_pct": 27.67, "eta": "52:04:34", "max_grad_norm": 0.8, "loss": 0.5984254479408264, "grad_norm": 1.1185030937194824, "learning_rate": 8.915352716158869e-05} +{"ts": "2025-12-27T14:04:16", "event": "train_log", "step": 3936, "epoch": 1.660759493670886, "progress_pct": 27.68, "epoch_pct": 27.68, "eta": "52:03:18", "max_grad_norm": 0.8, "loss": 0.6749780774116516, "grad_norm": 1.1489602327346802, "learning_rate": 8.913858716376081e-05} +{"ts": "2025-12-27T14:04:35", "event": "train_log", "step": 3938, "epoch": 1.6616033755274262, "progress_pct": 27.69, "epoch_pct": 27.69, "eta": "52:01:57", "max_grad_norm": 0.8, "loss": 0.6537864804267883, "grad_norm": 1.389431118965149, "learning_rate": 8.912363813741255e-05} +{"ts": "2025-12-27T14:04:55", "event": "train_log", "step": 3940, "epoch": 1.6624472573839664, "progress_pct": 27.71, "epoch_pct": 27.71, "eta": "52:00:37", "max_grad_norm": 0.8, "loss": 0.6033569574356079, "grad_norm": 1.0958757400512695, "learning_rate": 8.910868008599235e-05} +{"ts": "2025-12-27T14:05:15", "event": "train_log", "step": 3942, "epoch": 1.6632911392405063, "progress_pct": 27.72, "epoch_pct": 27.72, "eta": "51:59:18", "max_grad_norm": 0.8, "loss": 0.7404987215995789, "grad_norm": 1.2735344171524048, "learning_rate": 8.909371301295075e-05} +{"ts": "2025-12-27T14:05:35", "event": "train_log", "step": 3944, "epoch": 1.6641350210970463, "progress_pct": 27.74, "epoch_pct": 27.74, "eta": "51:57:59", "max_grad_norm": 0.8, "loss": 0.6265006065368652, "grad_norm": 1.123336911201477, "learning_rate": 8.907873692174038e-05} +{"ts": "2025-12-27T14:05:54", "event": "train_log", "step": 3946, "epoch": 1.6649789029535866, "progress_pct": 27.75, "epoch_pct": 27.75, "eta": "51:56:37", "max_grad_norm": 0.8, "loss": 0.650705099105835, "grad_norm": 1.259470820426941, "learning_rate": 8.90637518158159e-05} +{"ts": "2025-12-27T14:06:12", "event": "train_log", "step": 3948, "epoch": 1.6658227848101266, "progress_pct": 27.76, "epoch_pct": 27.76, "eta": "51:55:14", "max_grad_norm": 0.8, "loss": 0.7813970446586609, "grad_norm": 1.4020485877990723, "learning_rate": 8.904875769863412e-05} +{"ts": "2025-12-27T14:06:31", "event": "train_log", "step": 3950, "epoch": 1.6666666666666665, "progress_pct": 27.78, "epoch_pct": 27.78, "eta": "51:53:52", "max_grad_norm": 0.8, "loss": 0.6499447822570801, "grad_norm": 1.1709671020507812, "learning_rate": 8.903375457365389e-05} +{"ts": "2025-12-27T14:06:53", "event": "train_log", "step": 3952, "epoch": 1.667510548523207, "progress_pct": 27.79, "epoch_pct": 27.79, "eta": "51:52:38", "max_grad_norm": 0.8, "loss": 0.6141875386238098, "grad_norm": 1.085585355758667, "learning_rate": 8.901874244433612e-05} +{"ts": "2025-12-27T14:07:13", "event": "train_log", "step": 3954, "epoch": 1.6683544303797468, "progress_pct": 27.81, "epoch_pct": 27.81, "eta": "51:51:20", "max_grad_norm": 0.8, "loss": 0.7080221176147461, "grad_norm": 1.2340166568756104, "learning_rate": 8.900372131414386e-05} +{"ts": "2025-12-27T14:07:34", "event": "train_log", "step": 3956, "epoch": 1.6691983122362868, "progress_pct": 27.82, "epoch_pct": 27.82, "eta": "51:50:03", "max_grad_norm": 0.8, "loss": 0.6340513229370117, "grad_norm": 1.148576259613037, "learning_rate": 8.898869118654216e-05} +{"ts": "2025-12-27T14:07:54", "event": "train_log", "step": 3958, "epoch": 1.6700421940928272, "progress_pct": 27.83, "epoch_pct": 27.83, "eta": "51:48:44", "max_grad_norm": 0.8, "loss": 0.6999116539955139, "grad_norm": 1.2231999635696411, "learning_rate": 8.89736520649982e-05} +{"ts": "2025-12-27T14:08:14", "event": "train_log", "step": 3960, "epoch": 1.6708860759493671, "progress_pct": 27.85, "epoch_pct": 27.85, "eta": "51:47:25", "max_grad_norm": 0.8, "loss": 0.7177759408950806, "grad_norm": 1.1600396633148193, "learning_rate": 8.895860395298121e-05} +{"ts": "2025-12-27T14:08:33", "event": "train_log", "step": 3962, "epoch": 1.671729957805907, "progress_pct": 27.86, "epoch_pct": 27.86, "eta": "51:46:05", "max_grad_norm": 0.8, "loss": 0.6485702395439148, "grad_norm": 1.3019158840179443, "learning_rate": 8.894354685396251e-05} +{"ts": "2025-12-27T14:08:54", "event": "train_log", "step": 3964, "epoch": 1.6725738396624472, "progress_pct": 27.88, "epoch_pct": 27.88, "eta": "51:44:49", "max_grad_norm": 0.8, "loss": 0.6189450025558472, "grad_norm": 1.0153226852416992, "learning_rate": 8.892848077141546e-05} +{"ts": "2025-12-27T14:09:14", "event": "train_log", "step": 3966, "epoch": 1.6734177215189874, "progress_pct": 27.89, "epoch_pct": 27.89, "eta": "51:43:30", "max_grad_norm": 0.8, "loss": 0.6756728291511536, "grad_norm": 1.1953094005584717, "learning_rate": 8.891340570881555e-05} +{"ts": "2025-12-27T14:09:33", "event": "train_log", "step": 3968, "epoch": 1.6742616033755273, "progress_pct": 27.9, "epoch_pct": 27.9, "eta": "51:42:09", "max_grad_norm": 0.8, "loss": 0.6851167678833008, "grad_norm": 1.3376187086105347, "learning_rate": 8.889832166964027e-05} +{"ts": "2025-12-27T14:09:55", "event": "train_log", "step": 3970, "epoch": 1.6751054852320675, "progress_pct": 27.92, "epoch_pct": 27.92, "eta": "51:40:55", "max_grad_norm": 0.8, "loss": 0.5991915464401245, "grad_norm": 1.0045926570892334, "learning_rate": 8.888322865736924e-05} +{"ts": "2025-12-27T14:10:14", "event": "train_log", "step": 3972, "epoch": 1.6759493670886076, "progress_pct": 27.93, "epoch_pct": 27.93, "eta": "51:39:35", "max_grad_norm": 0.8, "loss": 0.713362455368042, "grad_norm": 1.2115750312805176, "learning_rate": 8.886812667548414e-05} +{"ts": "2025-12-27T14:10:35", "event": "train_log", "step": 3974, "epoch": 1.6767932489451476, "progress_pct": 27.95, "epoch_pct": 27.95, "eta": "51:38:18", "max_grad_norm": 0.8, "loss": 0.7058883309364319, "grad_norm": 1.1887929439544678, "learning_rate": 8.88530157274687e-05} +{"ts": "2025-12-27T14:10:55", "event": "train_log", "step": 3976, "epoch": 1.6776371308016877, "progress_pct": 27.96, "epoch_pct": 27.96, "eta": "51:37:01", "max_grad_norm": 0.8, "loss": 0.6501380801200867, "grad_norm": 1.1465295553207397, "learning_rate": 8.883789581680868e-05} +{"ts": "2025-12-27T14:11:16", "event": "train_log", "step": 3978, "epoch": 1.678481012658228, "progress_pct": 27.97, "epoch_pct": 27.97, "eta": "51:35:46", "max_grad_norm": 0.8, "loss": 0.6109840273857117, "grad_norm": 1.184693694114685, "learning_rate": 8.882276694699204e-05} +{"ts": "2025-12-27T14:11:35", "event": "train_log", "step": 3980, "epoch": 1.6793248945147679, "progress_pct": 27.99, "epoch_pct": 27.99, "eta": "51:34:24", "max_grad_norm": 0.8, "loss": 0.6815584897994995, "grad_norm": 1.2034777402877808, "learning_rate": 8.880762912150862e-05} +{"ts": "2025-12-27T14:11:54", "event": "train_log", "step": 3982, "epoch": 1.680168776371308, "progress_pct": 28.0, "epoch_pct": 28.0, "eta": "51:33:03", "max_grad_norm": 0.8, "loss": 0.6859248876571655, "grad_norm": 1.1312000751495361, "learning_rate": 8.879248234385052e-05} +{"ts": "2025-12-27T14:12:12", "event": "train_log", "step": 3984, "epoch": 1.6810126582278482, "progress_pct": 28.02, "epoch_pct": 28.02, "eta": "51:31:41", "max_grad_norm": 0.8, "loss": 0.6426702737808228, "grad_norm": 1.2273681163787842, "learning_rate": 8.877732661751173e-05} +{"ts": "2025-12-27T14:12:32", "event": "train_log", "step": 3986, "epoch": 1.6818565400843881, "progress_pct": 28.03, "epoch_pct": 28.03, "eta": "51:30:22", "max_grad_norm": 0.8, "loss": 0.6462456583976746, "grad_norm": 1.2550326585769653, "learning_rate": 8.876216194598844e-05} +{"ts": "2025-12-27T14:12:50", "event": "train_log", "step": 3988, "epoch": 1.6827004219409283, "progress_pct": 28.05, "epoch_pct": 28.05, "eta": "51:28:59", "max_grad_norm": 0.8, "loss": 0.6293925046920776, "grad_norm": 1.3111321926116943, "learning_rate": 8.874698833277884e-05} +{"ts": "2025-12-27T14:13:12", "event": "train_log", "step": 3990, "epoch": 1.6835443037974684, "progress_pct": 28.06, "epoch_pct": 28.06, "eta": "51:27:46", "max_grad_norm": 0.8, "loss": 0.59798264503479, "grad_norm": 1.037883996963501, "learning_rate": 8.873180578138316e-05} +{"ts": "2025-12-27T14:13:31", "event": "train_log", "step": 3992, "epoch": 1.6843881856540084, "progress_pct": 28.07, "epoch_pct": 28.07, "eta": "51:26:25", "max_grad_norm": 0.8, "loss": 0.6741529703140259, "grad_norm": 1.2411901950836182, "learning_rate": 8.871661429530376e-05} +{"ts": "2025-12-27T14:13:51", "event": "train_log", "step": 3994, "epoch": 1.6852320675105485, "progress_pct": 28.09, "epoch_pct": 28.09, "eta": "51:25:07", "max_grad_norm": 0.8, "loss": 0.5972680449485779, "grad_norm": 1.206354022026062, "learning_rate": 8.8701413878045e-05} +{"ts": "2025-12-27T14:14:12", "event": "train_log", "step": 3996, "epoch": 1.6860759493670887, "progress_pct": 28.1, "epoch_pct": 28.1, "eta": "51:23:52", "max_grad_norm": 0.8, "loss": 0.5879245400428772, "grad_norm": 1.1922144889831543, "learning_rate": 8.868620453311334e-05} +{"ts": "2025-12-27T14:14:30", "event": "train_log", "step": 3998, "epoch": 1.6869198312236287, "progress_pct": 28.12, "epoch_pct": 28.12, "eta": "51:22:31", "max_grad_norm": 0.8, "loss": 0.7381167411804199, "grad_norm": 1.3499996662139893, "learning_rate": 8.867098626401729e-05} +{"ts": "2025-12-27T14:14:50", "event": "train_log", "step": 4000, "epoch": 1.6877637130801688, "progress_pct": 28.13, "epoch_pct": 28.13, "eta": "51:21:14", "max_grad_norm": 0.8, "loss": 0.6590276956558228, "grad_norm": 1.3601514101028442, "learning_rate": 8.865575907426737e-05} +{"ts": "2025-12-27T14:28:59", "event": "train_log", "step": 4000, "epoch": 1.6877637130801688, "progress_pct": 28.13, "epoch_pct": 28.13, "eta": "51:57:23", "max_grad_norm": 0.8, "eval_loss": 0.7027890682220459, "eval_runtime": 848.7529, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482} +{"ts": "2025-12-27T14:29:20", "event": "train_log", "step": 4002, "epoch": 1.688607594936709, "progress_pct": 28.14, "epoch_pct": 28.14, "eta": "51:56:06", "max_grad_norm": 0.8, "loss": 0.5958077907562256, "grad_norm": 1.1060529947280884, "learning_rate": 8.864052296737624e-05} +{"ts": "2025-12-27T14:29:38", "event": "train_log", "step": 4004, "epoch": 1.689451476793249, "progress_pct": 28.16, "epoch_pct": 28.16, "eta": "51:54:42", "max_grad_norm": 0.8, "loss": 0.6802279353141785, "grad_norm": 1.2067371606826782, "learning_rate": 8.862527794685858e-05} +{"ts": "2025-12-27T14:29:59", "event": "train_log", "step": 4006, "epoch": 1.690295358649789, "progress_pct": 28.17, "epoch_pct": 28.17, "eta": "51:53:25", "max_grad_norm": 0.8, "loss": 0.5701603889465332, "grad_norm": 1.0094636678695679, "learning_rate": 8.86100240162311e-05} +{"ts": "2025-12-27T14:30:18", "event": "train_log", "step": 4008, "epoch": 1.6911392405063292, "progress_pct": 28.19, "epoch_pct": 28.19, "eta": "51:52:05", "max_grad_norm": 0.8, "loss": 0.6580625176429749, "grad_norm": 1.0976500511169434, "learning_rate": 8.85947611790126e-05} +{"ts": "2025-12-27T14:30:39", "event": "train_log", "step": 4010, "epoch": 1.6919831223628692, "progress_pct": 28.2, "epoch_pct": 28.2, "eta": "51:50:47", "max_grad_norm": 0.8, "loss": 0.5947542190551758, "grad_norm": 0.9448981285095215, "learning_rate": 8.857948943872392e-05} +{"ts": "2025-12-27T14:30:59", "event": "train_log", "step": 4012, "epoch": 1.6928270042194091, "progress_pct": 28.21, "epoch_pct": 28.21, "eta": "51:49:29", "max_grad_norm": 0.8, "loss": 0.6361464262008667, "grad_norm": 1.219609260559082, "learning_rate": 8.856420879888796e-05} +{"ts": "2025-12-27T14:31:18", "event": "train_log", "step": 4014, "epoch": 1.6936708860759495, "progress_pct": 28.23, "epoch_pct": 28.23, "eta": "51:48:09", "max_grad_norm": 0.8, "loss": 0.608664333820343, "grad_norm": 1.2395503520965576, "learning_rate": 8.854891926302966e-05} +{"ts": "2025-12-27T14:31:38", "event": "train_log", "step": 4016, "epoch": 1.6945147679324895, "progress_pct": 28.24, "epoch_pct": 28.24, "eta": "51:46:50", "max_grad_norm": 0.8, "loss": 0.6932460069656372, "grad_norm": 1.1300057172775269, "learning_rate": 8.853362083467604e-05} +{"ts": "2025-12-27T14:31:57", "event": "train_log", "step": 4018, "epoch": 1.6953586497890294, "progress_pct": 28.26, "epoch_pct": 28.26, "eta": "51:45:29", "max_grad_norm": 0.8, "loss": 0.646004855632782, "grad_norm": 1.2300254106521606, "learning_rate": 8.851831351735616e-05} +{"ts": "2025-12-27T14:32:16", "event": "train_log", "step": 4020, "epoch": 1.6962025316455698, "progress_pct": 28.27, "epoch_pct": 28.27, "eta": "51:44:07", "max_grad_norm": 0.8, "loss": 0.6760826110839844, "grad_norm": 1.2328956127166748, "learning_rate": 8.85029973146011e-05} +{"ts": "2025-12-27T14:32:37", "event": "train_log", "step": 4022, "epoch": 1.6970464135021097, "progress_pct": 28.28, "epoch_pct": 28.28, "eta": "51:42:51", "max_grad_norm": 0.8, "loss": 0.5943224430084229, "grad_norm": 1.1252286434173584, "learning_rate": 8.848767222994401e-05} +{"ts": "2025-12-27T14:32:56", "event": "train_log", "step": 4024, "epoch": 1.6978902953586497, "progress_pct": 28.3, "epoch_pct": 28.3, "eta": "51:41:31", "max_grad_norm": 0.8, "loss": 0.7535276412963867, "grad_norm": 1.1587592363357544, "learning_rate": 8.847233826692012e-05} +{"ts": "2025-12-27T14:33:17", "event": "train_log", "step": 4026, "epoch": 1.6987341772151898, "progress_pct": 28.31, "epoch_pct": 28.31, "eta": "51:40:14", "max_grad_norm": 0.8, "loss": 0.5903090834617615, "grad_norm": 1.0294606685638428, "learning_rate": 8.845699542906667e-05} +{"ts": "2025-12-27T14:33:35", "event": "train_log", "step": 4028, "epoch": 1.69957805907173, "progress_pct": 28.33, "epoch_pct": 28.33, "eta": "51:38:52", "max_grad_norm": 0.8, "loss": 0.6031379699707031, "grad_norm": 1.1940597295761108, "learning_rate": 8.844164371992295e-05} +{"ts": "2025-12-27T14:33:55", "event": "train_log", "step": 4030, "epoch": 1.70042194092827, "progress_pct": 28.34, "epoch_pct": 28.34, "eta": "51:37:32", "max_grad_norm": 0.8, "loss": 0.6185168623924255, "grad_norm": 1.0416409969329834, "learning_rate": 8.842628314303031e-05} +{"ts": "2025-12-27T14:34:14", "event": "train_log", "step": 4032, "epoch": 1.70126582278481, "progress_pct": 28.35, "epoch_pct": 28.35, "eta": "51:36:14", "max_grad_norm": 0.8, "loss": 0.6325570344924927, "grad_norm": 1.8715689182281494, "learning_rate": 8.841091370193214e-05} +{"ts": "2025-12-27T14:34:35", "event": "train_log", "step": 4034, "epoch": 1.7021097046413503, "progress_pct": 28.37, "epoch_pct": 28.37, "eta": "51:34:57", "max_grad_norm": 0.8, "loss": 0.7413952350616455, "grad_norm": 1.230658769607544, "learning_rate": 8.839553540017387e-05} +{"ts": "2025-12-27T14:34:53", "event": "train_log", "step": 4036, "epoch": 1.7029535864978902, "progress_pct": 28.38, "epoch_pct": 28.38, "eta": "51:33:35", "max_grad_norm": 0.8, "loss": 0.6973189115524292, "grad_norm": 1.298003077507019, "learning_rate": 8.838014824130299e-05} +{"ts": "2025-12-27T14:35:14", "event": "train_log", "step": 4038, "epoch": 1.7037974683544304, "progress_pct": 28.4, "epoch_pct": 28.4, "eta": "51:32:18", "max_grad_norm": 0.8, "loss": 0.6582493185997009, "grad_norm": 1.0246652364730835, "learning_rate": 8.836475222886902e-05} +{"ts": "2025-12-27T14:35:33", "event": "train_log", "step": 4040, "epoch": 1.7046413502109705, "progress_pct": 28.41, "epoch_pct": 28.41, "eta": "51:30:59", "max_grad_norm": 0.8, "loss": 0.6934399008750916, "grad_norm": 1.3652594089508057, "learning_rate": 8.834934736642351e-05} +{"ts": "2025-12-27T14:35:54", "event": "train_log", "step": 4042, "epoch": 1.7054852320675105, "progress_pct": 28.42, "epoch_pct": 28.42, "eta": "51:29:42", "max_grad_norm": 0.8, "loss": 0.6437561511993408, "grad_norm": 1.029778242111206, "learning_rate": 8.833393365752007e-05} +{"ts": "2025-12-27T14:36:14", "event": "train_log", "step": 4044, "epoch": 1.7063291139240506, "progress_pct": 28.44, "epoch_pct": 28.44, "eta": "51:28:25", "max_grad_norm": 0.8, "loss": 0.605059027671814, "grad_norm": 1.1993004083633423, "learning_rate": 8.831851110571437e-05} +{"ts": "2025-12-27T14:36:34", "event": "train_log", "step": 4046, "epoch": 1.7071729957805908, "progress_pct": 28.45, "epoch_pct": 28.45, "eta": "51:27:09", "max_grad_norm": 0.8, "loss": 0.7035017609596252, "grad_norm": 1.286389946937561, "learning_rate": 8.830307971456406e-05} +{"ts": "2025-12-27T14:36:54", "event": "train_log", "step": 4048, "epoch": 1.7080168776371307, "progress_pct": 28.47, "epoch_pct": 28.47, "eta": "51:25:50", "max_grad_norm": 0.8, "loss": 0.6429924964904785, "grad_norm": 1.1211459636688232, "learning_rate": 8.82876394876289e-05} +{"ts": "2025-12-27T14:37:14", "event": "train_log", "step": 4050, "epoch": 1.7088607594936709, "progress_pct": 28.48, "epoch_pct": 28.48, "eta": "51:24:32", "max_grad_norm": 0.8, "loss": 0.6454769968986511, "grad_norm": 1.1284868717193604, "learning_rate": 8.827219042847064e-05} +{"ts": "2025-12-27T14:37:35", "event": "train_log", "step": 4052, "epoch": 1.709704641350211, "progress_pct": 28.5, "epoch_pct": 28.5, "eta": "51:23:16", "max_grad_norm": 0.8, "loss": 0.707233190536499, "grad_norm": 1.1934884786605835, "learning_rate": 8.825673254065306e-05} +{"ts": "2025-12-27T14:37:55", "event": "train_log", "step": 4054, "epoch": 1.710548523206751, "progress_pct": 28.51, "epoch_pct": 28.51, "eta": "51:21:59", "max_grad_norm": 0.8, "loss": 0.6790444254875183, "grad_norm": 1.1560680866241455, "learning_rate": 8.824126582774203e-05} +{"ts": "2025-12-27T14:38:15", "event": "train_log", "step": 4056, "epoch": 1.7113924050632912, "progress_pct": 28.52, "epoch_pct": 28.52, "eta": "51:20:42", "max_grad_norm": 0.8, "loss": 0.6115295886993408, "grad_norm": 1.1924364566802979, "learning_rate": 8.822579029330541e-05} +{"ts": "2025-12-27T14:38:35", "event": "train_log", "step": 4058, "epoch": 1.7122362869198313, "progress_pct": 28.54, "epoch_pct": 28.54, "eta": "51:19:25", "max_grad_norm": 0.8, "loss": 0.7039182186126709, "grad_norm": 1.107370138168335, "learning_rate": 8.82103059409131e-05} +{"ts": "2025-12-27T14:38:55", "event": "train_log", "step": 4060, "epoch": 1.7130801687763713, "progress_pct": 28.55, "epoch_pct": 28.55, "eta": "51:18:07", "max_grad_norm": 0.8, "loss": 0.6580052971839905, "grad_norm": 1.2554657459259033, "learning_rate": 8.819481277413707e-05} +{"ts": "2025-12-27T14:39:14", "event": "train_log", "step": 4062, "epoch": 1.7139240506329114, "progress_pct": 28.57, "epoch_pct": 28.57, "eta": "51:16:47", "max_grad_norm": 0.8, "loss": 0.6042479276657104, "grad_norm": 1.2873135805130005, "learning_rate": 8.817931079655127e-05} +{"ts": "2025-12-27T14:39:33", "event": "train_log", "step": 4064, "epoch": 1.7147679324894516, "progress_pct": 28.58, "epoch_pct": 28.58, "eta": "51:15:29", "max_grad_norm": 0.8, "loss": 0.5992372632026672, "grad_norm": 1.027056097984314, "learning_rate": 8.816380001173172e-05} +{"ts": "2025-12-27T14:39:52", "event": "train_log", "step": 4066, "epoch": 1.7156118143459915, "progress_pct": 28.59, "epoch_pct": 28.59, "eta": "51:14:09", "max_grad_norm": 0.8, "loss": 0.7078655362129211, "grad_norm": 1.0694721937179565, "learning_rate": 8.814828042325644e-05} +{"ts": "2025-12-27T14:40:12", "event": "train_log", "step": 4068, "epoch": 1.7164556962025317, "progress_pct": 28.61, "epoch_pct": 28.61, "eta": "51:12:52", "max_grad_norm": 0.8, "loss": 0.6618752479553223, "grad_norm": 1.194984793663025, "learning_rate": 8.813275203470555e-05} +{"ts": "2025-12-27T14:40:31", "event": "train_log", "step": 4070, "epoch": 1.7172995780590719, "progress_pct": 28.62, "epoch_pct": 28.62, "eta": "51:11:32", "max_grad_norm": 0.8, "loss": 0.6328625679016113, "grad_norm": 1.1713165044784546, "learning_rate": 8.811721484966109e-05} +{"ts": "2025-12-27T14:40:51", "event": "train_log", "step": 4072, "epoch": 1.7181434599156118, "progress_pct": 28.64, "epoch_pct": 28.64, "eta": "51:10:14", "max_grad_norm": 0.8, "loss": 0.5916416645050049, "grad_norm": 0.9993656277656555, "learning_rate": 8.810166887170724e-05} +{"ts": "2025-12-27T14:41:10", "event": "train_log", "step": 4074, "epoch": 1.7189873417721517, "progress_pct": 28.65, "epoch_pct": 28.65, "eta": "51:08:55", "max_grad_norm": 0.8, "loss": 0.6490002274513245, "grad_norm": 1.172642707824707, "learning_rate": 8.808611410443011e-05} +{"ts": "2025-12-27T14:41:29", "event": "train_log", "step": 4076, "epoch": 1.7198312236286921, "progress_pct": 28.66, "epoch_pct": 28.66, "eta": "51:07:36", "max_grad_norm": 0.8, "loss": 0.6571791172027588, "grad_norm": 1.1404821872711182, "learning_rate": 8.807055055141793e-05} +{"ts": "2025-12-27T14:41:48", "event": "train_log", "step": 4078, "epoch": 1.720675105485232, "progress_pct": 28.68, "epoch_pct": 28.68, "eta": "51:06:16", "max_grad_norm": 0.8, "loss": 0.6233854293823242, "grad_norm": 1.2104214429855347, "learning_rate": 8.80549782162609e-05} +{"ts": "2025-12-27T14:42:08", "event": "train_log", "step": 4080, "epoch": 1.721518987341772, "progress_pct": 28.69, "epoch_pct": 28.69, "eta": "51:04:59", "max_grad_norm": 0.8, "loss": 0.6331531405448914, "grad_norm": 1.1691396236419678, "learning_rate": 8.803939710255126e-05} +{"ts": "2025-12-27T14:42:27", "event": "train_log", "step": 4082, "epoch": 1.7223628691983124, "progress_pct": 28.71, "epoch_pct": 28.71, "eta": "51:03:40", "max_grad_norm": 0.8, "loss": 0.6321156620979309, "grad_norm": 1.263174057006836, "learning_rate": 8.802380721388325e-05} +{"ts": "2025-12-27T14:42:47", "event": "train_log", "step": 4084, "epoch": 1.7232067510548523, "progress_pct": 28.72, "epoch_pct": 28.72, "eta": "51:02:23", "max_grad_norm": 0.8, "loss": 0.644904613494873, "grad_norm": 1.0685606002807617, "learning_rate": 8.80082085538532e-05} +{"ts": "2025-12-27T14:43:06", "event": "train_log", "step": 4086, "epoch": 1.7240506329113923, "progress_pct": 28.73, "epoch_pct": 28.73, "eta": "51:01:04", "max_grad_norm": 0.8, "loss": 0.6743831634521484, "grad_norm": 1.2289735078811646, "learning_rate": 8.799260112605938e-05} +{"ts": "2025-12-27T14:43:25", "event": "train_log", "step": 4088, "epoch": 1.7248945147679327, "progress_pct": 28.75, "epoch_pct": 28.75, "eta": "50:59:47", "max_grad_norm": 0.8, "loss": 0.6866999268531799, "grad_norm": 1.0661355257034302, "learning_rate": 8.797698493410216e-05} +{"ts": "2025-12-27T14:43:44", "event": "train_log", "step": 4090, "epoch": 1.7257383966244726, "progress_pct": 28.76, "epoch_pct": 28.76, "eta": "50:58:27", "max_grad_norm": 0.8, "loss": 0.691387414932251, "grad_norm": 1.1001228094100952, "learning_rate": 8.796135998158386e-05} +{"ts": "2025-12-27T14:44:03", "event": "train_log", "step": 4092, "epoch": 1.7265822784810125, "progress_pct": 28.78, "epoch_pct": 28.78, "eta": "50:57:09", "max_grad_norm": 0.8, "loss": 0.5882864594459534, "grad_norm": 1.1078115701675415, "learning_rate": 8.794572627210887e-05} +{"ts": "2025-12-27T14:44:22", "event": "train_log", "step": 4094, "epoch": 1.7274261603375527, "progress_pct": 28.79, "epoch_pct": 28.79, "eta": "50:55:49", "max_grad_norm": 0.8, "loss": 0.6192089319229126, "grad_norm": 1.0483999252319336, "learning_rate": 8.79300838092836e-05} +{"ts": "2025-12-27T14:44:42", "event": "train_log", "step": 4096, "epoch": 1.7282700421940929, "progress_pct": 28.8, "epoch_pct": 28.8, "eta": "50:54:33", "max_grad_norm": 0.8, "loss": 0.603322446346283, "grad_norm": 1.1194913387298584, "learning_rate": 8.791443259671645e-05} +{"ts": "2025-12-27T14:45:02", "event": "train_log", "step": 4098, "epoch": 1.7291139240506328, "progress_pct": 28.82, "epoch_pct": 28.82, "eta": "50:53:17", "max_grad_norm": 0.8, "loss": 0.6141818165779114, "grad_norm": 1.1800397634506226, "learning_rate": 8.789877263801787e-05} +{"ts": "2025-12-27T14:45:21", "event": "train_log", "step": 4100, "epoch": 1.729957805907173, "progress_pct": 28.83, "epoch_pct": 28.83, "eta": "50:52:00", "max_grad_norm": 0.8, "loss": 0.6707983016967773, "grad_norm": 1.261768102645874, "learning_rate": 8.78831039368003e-05} +{"ts": "2025-12-27T14:59:26", "event": "train_log", "step": 4100, "epoch": 1.729957805907173, "progress_pct": 28.83, "epoch_pct": 28.83, "eta": "51:26:44", "max_grad_norm": 0.8, "eval_loss": 0.7022181153297424, "eval_runtime": 844.6405, "eval_samples_per_second": 2.495, "eval_steps_per_second": 2.495} +{"ts": "2025-12-27T14:59:45", "event": "train_log", "step": 4102, "epoch": 1.7308016877637131, "progress_pct": 28.85, "epoch_pct": 28.85, "eta": "51:25:25", "max_grad_norm": 0.8, "loss": 0.6440353989601135, "grad_norm": 1.2505232095718384, "learning_rate": 8.786742649667822e-05} +{"ts": "2025-12-27T15:00:05", "event": "train_log", "step": 4104, "epoch": 1.731645569620253, "progress_pct": 28.86, "epoch_pct": 28.86, "eta": "51:24:06", "max_grad_norm": 0.8, "loss": 0.6712808012962341, "grad_norm": 1.2631809711456299, "learning_rate": 8.78517403212681e-05} +{"ts": "2025-12-27T15:00:24", "event": "train_log", "step": 4106, "epoch": 1.7324894514767932, "progress_pct": 28.87, "epoch_pct": 28.87, "eta": "51:22:47", "max_grad_norm": 0.8, "loss": 0.6854958534240723, "grad_norm": 1.2781071662902832, "learning_rate": 8.783604541418845e-05} +{"ts": "2025-12-27T15:00:43", "event": "train_log", "step": 4108, "epoch": 1.7333333333333334, "progress_pct": 28.89, "epoch_pct": 28.89, "eta": "51:21:27", "max_grad_norm": 0.8, "loss": 0.6281477808952332, "grad_norm": 1.1065936088562012, "learning_rate": 8.782034177905976e-05} +{"ts": "2025-12-27T15:01:03", "event": "train_log", "step": 4110, "epoch": 1.7341772151898733, "progress_pct": 28.9, "epoch_pct": 28.9, "eta": "51:20:09", "max_grad_norm": 0.8, "loss": 0.6835165619850159, "grad_norm": 1.010961890220642, "learning_rate": 8.780462941950457e-05} +{"ts": "2025-12-27T15:01:22", "event": "train_log", "step": 4112, "epoch": 1.7350210970464135, "progress_pct": 28.92, "epoch_pct": 28.92, "eta": "51:18:49", "max_grad_norm": 0.8, "loss": 0.6674962639808655, "grad_norm": 1.1467366218566895, "learning_rate": 8.778890833914744e-05} +{"ts": "2025-12-27T15:01:37", "event": "train_log", "step": 4114, "epoch": 1.7358649789029537, "progress_pct": 28.93, "epoch_pct": 28.93, "eta": "51:17:19", "max_grad_norm": 0.8, "loss": 0.5967551469802856, "grad_norm": 1.0221859216690063, "learning_rate": 8.77731785416149e-05} +{"ts": "2025-12-27T15:01:49", "event": "train_log", "step": 4116, "epoch": 1.7367088607594936, "progress_pct": 28.95, "epoch_pct": 28.95, "eta": "51:15:43", "max_grad_norm": 0.8, "loss": 0.7356855869293213, "grad_norm": 1.347937822341919, "learning_rate": 8.775744003053552e-05} +{"ts": "2025-12-27T15:02:01", "event": "train_log", "step": 4118, "epoch": 1.7375527426160338, "progress_pct": 28.96, "epoch_pct": 28.96, "eta": "51:14:07", "max_grad_norm": 0.8, "loss": 0.6932644844055176, "grad_norm": 1.2952557802200317, "learning_rate": 8.774169280953988e-05} +{"ts": "2025-12-27T15:02:15", "event": "train_log", "step": 4120, "epoch": 1.738396624472574, "progress_pct": 28.97, "epoch_pct": 28.97, "eta": "51:12:34", "max_grad_norm": 0.8, "loss": 0.5917407870292664, "grad_norm": 1.0157089233398438, "learning_rate": 8.772593688226052e-05} +{"ts": "2025-12-27T15:02:27", "event": "train_log", "step": 4122, "epoch": 1.7392405063291139, "progress_pct": 28.99, "epoch_pct": 28.99, "eta": "51:11:00", "max_grad_norm": 0.8, "loss": 0.6335760354995728, "grad_norm": 1.1537878513336182, "learning_rate": 8.77101722523321e-05} +{"ts": "2025-12-27T15:02:40", "event": "train_log", "step": 4124, "epoch": 1.740084388185654, "progress_pct": 29.0, "epoch_pct": 29.0, "eta": "51:09:24", "max_grad_norm": 0.8, "loss": 0.6892110109329224, "grad_norm": 1.0989667177200317, "learning_rate": 8.769439892339115e-05} +{"ts": "2025-12-27T15:02:52", "event": "train_log", "step": 4126, "epoch": 1.7409282700421942, "progress_pct": 29.02, "epoch_pct": 29.02, "eta": "51:07:49", "max_grad_norm": 0.8, "loss": 0.5966230630874634, "grad_norm": 1.1293572187423706, "learning_rate": 8.767861689907633e-05} +{"ts": "2025-12-27T15:03:05", "event": "train_log", "step": 4128, "epoch": 1.7417721518987341, "progress_pct": 29.03, "epoch_pct": 29.03, "eta": "51:06:15", "max_grad_norm": 0.8, "loss": 0.5981804728507996, "grad_norm": 1.1167775392532349, "learning_rate": 8.76628261830282e-05} +{"ts": "2025-12-27T15:03:19", "event": "train_log", "step": 4130, "epoch": 1.7426160337552743, "progress_pct": 29.04, "epoch_pct": 29.04, "eta": "51:04:42", "max_grad_norm": 0.8, "loss": 0.5539529919624329, "grad_norm": 1.0572419166564941, "learning_rate": 8.76470267788894e-05} +{"ts": "2025-12-27T15:03:31", "event": "train_log", "step": 4132, "epoch": 1.7434599156118145, "progress_pct": 29.06, "epoch_pct": 29.06, "eta": "51:03:08", "max_grad_norm": 0.8, "loss": 0.6238219141960144, "grad_norm": 0.937256932258606, "learning_rate": 8.763121869030456e-05} +{"ts": "2025-12-27T15:03:44", "event": "train_log", "step": 4134, "epoch": 1.7443037974683544, "progress_pct": 29.07, "epoch_pct": 29.07, "eta": "51:01:34", "max_grad_norm": 0.8, "loss": 0.6033329963684082, "grad_norm": 1.082932472229004, "learning_rate": 8.761540192092029e-05} +{"ts": "2025-12-27T15:03:58", "event": "train_log", "step": 4136, "epoch": 1.7451476793248946, "progress_pct": 29.09, "epoch_pct": 29.09, "eta": "51:00:02", "max_grad_norm": 0.8, "loss": 0.5567626357078552, "grad_norm": 1.0495184659957886, "learning_rate": 8.75995764743852e-05} +{"ts": "2025-12-27T15:04:10", "event": "train_log", "step": 4138, "epoch": 1.7459915611814347, "progress_pct": 29.1, "epoch_pct": 29.1, "eta": "50:58:27", "max_grad_norm": 0.8, "loss": 0.6759346127510071, "grad_norm": 1.3143779039382935, "learning_rate": 8.758374235434994e-05} +{"ts": "2025-12-27T15:04:23", "event": "train_log", "step": 4140, "epoch": 1.7468354430379747, "progress_pct": 29.11, "epoch_pct": 29.11, "eta": "50:56:52", "max_grad_norm": 0.8, "loss": 0.6439400315284729, "grad_norm": 1.2385786771774292, "learning_rate": 8.756789956446713e-05} +{"ts": "2025-12-27T15:04:35", "event": "train_log", "step": 4142, "epoch": 1.7476793248945146, "progress_pct": 29.13, "epoch_pct": 29.13, "eta": "50:55:18", "max_grad_norm": 0.8, "loss": 0.627493679523468, "grad_norm": 1.0453747510910034, "learning_rate": 8.75520481083914e-05} +{"ts": "2025-12-27T15:04:48", "event": "train_log", "step": 4144, "epoch": 1.748523206751055, "progress_pct": 29.14, "epoch_pct": 29.14, "eta": "50:53:43", "max_grad_norm": 0.8, "loss": 0.677209198474884, "grad_norm": 1.09946608543396, "learning_rate": 8.753618798977935e-05} +{"ts": "2025-12-27T15:05:01", "event": "train_log", "step": 4146, "epoch": 1.749367088607595, "progress_pct": 29.16, "epoch_pct": 29.16, "eta": "50:52:10", "max_grad_norm": 0.8, "loss": 0.6874014735221863, "grad_norm": 1.2207063436508179, "learning_rate": 8.752031921228965e-05} +{"ts": "2025-12-27T15:05:13", "event": "train_log", "step": 4148, "epoch": 1.7502109704641349, "progress_pct": 29.17, "epoch_pct": 29.17, "eta": "50:50:36", "max_grad_norm": 0.8, "loss": 0.6332831382751465, "grad_norm": 1.2520697116851807, "learning_rate": 8.750444177958288e-05} +{"ts": "2025-12-27T15:05:26", "event": "train_log", "step": 4150, "epoch": 1.7510548523206753, "progress_pct": 29.18, "epoch_pct": 29.18, "eta": "50:49:02", "max_grad_norm": 0.8, "loss": 0.682744562625885, "grad_norm": 1.2463186979293823, "learning_rate": 8.748855569532168e-05} +{"ts": "2025-12-27T15:05:38", "event": "train_log", "step": 4152, "epoch": 1.7518987341772152, "progress_pct": 29.2, "epoch_pct": 29.2, "eta": "50:47:28", "max_grad_norm": 0.8, "loss": 0.7006803750991821, "grad_norm": 1.1895235776901245, "learning_rate": 8.747266096317069e-05} +{"ts": "2025-12-27T15:05:51", "event": "train_log", "step": 4154, "epoch": 1.7527426160337551, "progress_pct": 29.21, "epoch_pct": 29.21, "eta": "50:45:53", "max_grad_norm": 0.8, "loss": 0.6751191020011902, "grad_norm": 1.1627185344696045, "learning_rate": 8.745675758679646e-05} +{"ts": "2025-12-27T15:06:03", "event": "train_log", "step": 4156, "epoch": 1.7535864978902953, "progress_pct": 29.23, "epoch_pct": 29.23, "eta": "50:44:19", "max_grad_norm": 0.8, "loss": 0.661848247051239, "grad_norm": 1.324127197265625, "learning_rate": 8.744084556986764e-05} +{"ts": "2025-12-27T15:06:15", "event": "train_log", "step": 4158, "epoch": 1.7544303797468355, "progress_pct": 29.24, "epoch_pct": 29.24, "eta": "50:42:43", "max_grad_norm": 0.8, "loss": 0.7057217955589294, "grad_norm": 1.226809024810791, "learning_rate": 8.74249249160548e-05} +{"ts": "2025-12-27T15:06:27", "event": "train_log", "step": 4160, "epoch": 1.7552742616033754, "progress_pct": 29.25, "epoch_pct": 29.25, "eta": "50:41:09", "max_grad_norm": 0.8, "loss": 0.6856105923652649, "grad_norm": 1.2341214418411255, "learning_rate": 8.740899562903056e-05} +{"ts": "2025-12-27T15:06:40", "event": "train_log", "step": 4162, "epoch": 1.7561181434599156, "progress_pct": 29.27, "epoch_pct": 29.27, "eta": "50:39:36", "max_grad_norm": 0.8, "loss": 0.6616930365562439, "grad_norm": 1.3907564878463745, "learning_rate": 8.739305771246946e-05} +{"ts": "2025-12-27T15:06:52", "event": "train_log", "step": 4164, "epoch": 1.7569620253164557, "progress_pct": 29.28, "epoch_pct": 29.28, "eta": "50:38:01", "max_grad_norm": 0.8, "loss": 0.5791551470756531, "grad_norm": 1.2756825685501099, "learning_rate": 8.737711117004812e-05} +{"ts": "2025-12-27T15:07:04", "event": "train_log", "step": 4166, "epoch": 1.7578059071729957, "progress_pct": 29.3, "epoch_pct": 29.3, "eta": "50:36:26", "max_grad_norm": 0.8, "loss": 0.7074756622314453, "grad_norm": 1.2861095666885376, "learning_rate": 8.736115600544506e-05} +{"ts": "2025-12-27T15:07:16", "event": "train_log", "step": 4168, "epoch": 1.7586497890295358, "progress_pct": 29.31, "epoch_pct": 29.31, "eta": "50:34:53", "max_grad_norm": 0.8, "loss": 0.6494167447090149, "grad_norm": 1.2198424339294434, "learning_rate": 8.734519222234083e-05} +{"ts": "2025-12-27T15:07:28", "event": "train_log", "step": 4170, "epoch": 1.759493670886076, "progress_pct": 29.32, "epoch_pct": 29.32, "eta": "50:33:19", "max_grad_norm": 0.8, "loss": 0.6546841859817505, "grad_norm": 1.19169020652771, "learning_rate": 8.732921982441799e-05} +{"ts": "2025-12-27T15:07:41", "event": "train_log", "step": 4172, "epoch": 1.760337552742616, "progress_pct": 29.34, "epoch_pct": 29.34, "eta": "50:31:45", "max_grad_norm": 0.8, "loss": 0.6701815724372864, "grad_norm": 1.11533784866333, "learning_rate": 8.731323881536108e-05} +{"ts": "2025-12-27T15:07:53", "event": "train_log", "step": 4174, "epoch": 1.761181434599156, "progress_pct": 29.35, "epoch_pct": 29.35, "eta": "50:30:12", "max_grad_norm": 0.8, "loss": 0.6678179502487183, "grad_norm": 1.2148140668869019, "learning_rate": 8.729724919885657e-05} +{"ts": "2025-12-27T15:08:06", "event": "train_log", "step": 4176, "epoch": 1.7620253164556963, "progress_pct": 29.37, "epoch_pct": 29.37, "eta": "50:28:38", "max_grad_norm": 0.8, "loss": 0.6505144834518433, "grad_norm": 1.1968709230422974, "learning_rate": 8.728125097859298e-05} +{"ts": "2025-12-27T15:08:18", "event": "train_log", "step": 4178, "epoch": 1.7628691983122362, "progress_pct": 29.38, "epoch_pct": 29.38, "eta": "50:27:05", "max_grad_norm": 0.8, "loss": 0.6531696915626526, "grad_norm": 1.0954766273498535, "learning_rate": 8.726524415826079e-05} +{"ts": "2025-12-27T15:08:31", "event": "train_log", "step": 4180, "epoch": 1.7637130801687764, "progress_pct": 29.4, "epoch_pct": 29.4, "eta": "50:25:32", "max_grad_norm": 0.8, "loss": 0.710014283657074, "grad_norm": 1.5149537324905396, "learning_rate": 8.724922874155246e-05} +{"ts": "2025-12-27T15:08:43", "event": "train_log", "step": 4182, "epoch": 1.7645569620253165, "progress_pct": 29.41, "epoch_pct": 29.41, "eta": "50:23:59", "max_grad_norm": 0.8, "loss": 0.714016318321228, "grad_norm": 1.145113229751587, "learning_rate": 8.723320473216245e-05} +{"ts": "2025-12-27T15:09:00", "event": "train_log", "step": 4184, "epoch": 1.7654008438818565, "progress_pct": 29.42, "epoch_pct": 29.42, "eta": "50:22:35", "max_grad_norm": 0.8, "loss": 0.6775414347648621, "grad_norm": 0.9454524517059326, "learning_rate": 8.721717213378719e-05} +{"ts": "2025-12-27T15:09:16", "event": "train_log", "step": 4186, "epoch": 1.7662447257383966, "progress_pct": 29.44, "epoch_pct": 29.44, "eta": "50:21:11", "max_grad_norm": 0.8, "loss": 0.6279728412628174, "grad_norm": 1.1414754390716553, "learning_rate": 8.720113095012507e-05} +{"ts": "2025-12-27T15:09:31", "event": "train_log", "step": 4188, "epoch": 1.7670886075949368, "progress_pct": 29.45, "epoch_pct": 29.45, "eta": "50:19:44", "max_grad_norm": 0.8, "loss": 0.5894309282302856, "grad_norm": 1.212802767753601, "learning_rate": 8.718508118487652e-05} +{"ts": "2025-12-27T15:09:48", "event": "train_log", "step": 4190, "epoch": 1.7679324894514767, "progress_pct": 29.47, "epoch_pct": 29.47, "eta": "50:18:22", "max_grad_norm": 0.8, "loss": 0.6124046444892883, "grad_norm": 1.5213478803634644, "learning_rate": 8.716902284174388e-05} +{"ts": "2025-12-27T15:10:04", "event": "train_log", "step": 4192, "epoch": 1.768776371308017, "progress_pct": 29.48, "epoch_pct": 29.48, "eta": "50:17:00", "max_grad_norm": 0.8, "loss": 0.5990801453590393, "grad_norm": 0.9973840713500977, "learning_rate": 8.715295592443154e-05} +{"ts": "2025-12-27T15:10:20", "event": "train_log", "step": 4194, "epoch": 1.769620253164557, "progress_pct": 29.49, "epoch_pct": 29.49, "eta": "50:15:36", "max_grad_norm": 0.8, "loss": 0.6485559344291687, "grad_norm": 1.1084294319152832, "learning_rate": 8.713688043664579e-05} +{"ts": "2025-12-27T15:10:35", "event": "train_log", "step": 4196, "epoch": 1.770464135021097, "progress_pct": 29.51, "epoch_pct": 29.51, "eta": "50:14:09", "max_grad_norm": 0.8, "loss": 0.7083099484443665, "grad_norm": 1.1401913166046143, "learning_rate": 8.712079638209493e-05} +{"ts": "2025-12-27T15:10:50", "event": "train_log", "step": 4198, "epoch": 1.7713080168776372, "progress_pct": 29.52, "epoch_pct": 29.52, "eta": "50:12:43", "max_grad_norm": 0.8, "loss": 0.7237915992736816, "grad_norm": 1.278105616569519, "learning_rate": 8.71047037644893e-05} +{"ts": "2025-12-27T15:11:06", "event": "train_log", "step": 4200, "epoch": 1.7721518987341773, "progress_pct": 29.54, "epoch_pct": 29.54, "eta": "50:11:17", "max_grad_norm": 0.8, "loss": 0.6259870529174805, "grad_norm": 1.2407530546188354, "learning_rate": 8.708860258754108e-05} +{"ts": "2025-12-27T15:20:08", "event": "train_log", "step": 4200, "epoch": 1.7721518987341773, "progress_pct": 29.54, "epoch_pct": 29.54, "eta": "50:32:50", "max_grad_norm": 0.8, "eval_loss": 0.6993561387062073, "eval_runtime": 542.0281, "eval_samples_per_second": 3.887, "eval_steps_per_second": 3.887} +{"ts": "2025-12-27T15:20:21", "event": "train_log", "step": 4202, "epoch": 1.7729957805907173, "progress_pct": 29.55, "epoch_pct": 29.55, "eta": "50:31:18", "max_grad_norm": 0.8, "loss": 0.6604248285293579, "grad_norm": 1.102859616279602, "learning_rate": 8.707249285496457e-05} +{"ts": "2025-12-27T15:20:33", "event": "train_log", "step": 4204, "epoch": 1.7738396624472574, "progress_pct": 29.56, "epoch_pct": 29.56, "eta": "50:29:45", "max_grad_norm": 0.8, "loss": 0.6799775958061218, "grad_norm": 1.2478244304656982, "learning_rate": 8.705637457047594e-05} +{"ts": "2025-12-27T15:20:45", "event": "train_log", "step": 4206, "epoch": 1.7746835443037976, "progress_pct": 29.58, "epoch_pct": 29.58, "eta": "50:28:12", "max_grad_norm": 0.8, "loss": 0.6136477589607239, "grad_norm": 1.1178022623062134, "learning_rate": 8.704024773779338e-05} +{"ts": "2025-12-27T15:20:58", "event": "train_log", "step": 4208, "epoch": 1.7755274261603375, "progress_pct": 29.59, "epoch_pct": 29.59, "eta": "50:26:39", "max_grad_norm": 0.8, "loss": 0.6568390130996704, "grad_norm": 1.904076337814331, "learning_rate": 8.702411236063703e-05} +{"ts": "2025-12-27T15:21:11", "event": "train_log", "step": 4210, "epoch": 1.7763713080168775, "progress_pct": 29.61, "epoch_pct": 29.61, "eta": "50:25:08", "max_grad_norm": 0.8, "loss": 0.6404406428337097, "grad_norm": 1.0902835130691528, "learning_rate": 8.700796844272903e-05} +{"ts": "2025-12-27T15:21:23", "event": "train_log", "step": 4212, "epoch": 1.7772151898734179, "progress_pct": 29.62, "epoch_pct": 29.62, "eta": "50:23:35", "max_grad_norm": 0.8, "loss": 0.6924911737442017, "grad_norm": 1.1858288049697876, "learning_rate": 8.699181598779347e-05} +{"ts": "2025-12-27T15:21:37", "event": "train_log", "step": 4214, "epoch": 1.7780590717299578, "progress_pct": 29.63, "epoch_pct": 29.63, "eta": "50:22:04", "max_grad_norm": 0.8, "loss": 0.572692334651947, "grad_norm": 1.0015727281570435, "learning_rate": 8.69756549995564e-05} +{"ts": "2025-12-27T15:21:49", "event": "train_log", "step": 4216, "epoch": 1.7789029535864977, "progress_pct": 29.65, "epoch_pct": 29.65, "eta": "50:20:30", "max_grad_norm": 0.8, "loss": 0.7196018695831299, "grad_norm": 1.440079689025879, "learning_rate": 8.695948548174583e-05} +{"ts": "2025-12-27T15:22:02", "event": "train_log", "step": 4218, "epoch": 1.7797468354430381, "progress_pct": 29.66, "epoch_pct": 29.66, "eta": "50:18:58", "max_grad_norm": 0.8, "loss": 0.5870906710624695, "grad_norm": 1.1320992708206177, "learning_rate": 8.69433074380918e-05} +{"ts": "2025-12-27T15:22:15", "event": "train_log", "step": 4220, "epoch": 1.780590717299578, "progress_pct": 29.68, "epoch_pct": 29.68, "eta": "50:17:28", "max_grad_norm": 0.8, "loss": 0.6501539349555969, "grad_norm": 1.3156964778900146, "learning_rate": 8.692712087232626e-05} +{"ts": "2025-12-27T15:22:27", "event": "train_log", "step": 4222, "epoch": 1.781434599156118, "progress_pct": 29.69, "epoch_pct": 29.69, "eta": "50:15:56", "max_grad_norm": 0.8, "loss": 0.7017278075218201, "grad_norm": 1.1869803667068481, "learning_rate": 8.691092578818311e-05} +{"ts": "2025-12-27T15:22:41", "event": "train_log", "step": 4224, "epoch": 1.7822784810126582, "progress_pct": 29.7, "epoch_pct": 29.7, "eta": "50:14:26", "max_grad_norm": 0.8, "loss": 0.5954802632331848, "grad_norm": 0.9708380699157715, "learning_rate": 8.689472218939829e-05} +{"ts": "2025-12-27T15:22:53", "event": "train_log", "step": 4226, "epoch": 1.7831223628691983, "progress_pct": 29.72, "epoch_pct": 29.72, "eta": "50:12:53", "max_grad_norm": 0.8, "loss": 0.6494144797325134, "grad_norm": 1.0753228664398193, "learning_rate": 8.687851007970962e-05} +{"ts": "2025-12-27T15:23:06", "event": "train_log", "step": 4228, "epoch": 1.7839662447257383, "progress_pct": 29.73, "epoch_pct": 29.73, "eta": "50:11:22", "max_grad_norm": 0.8, "loss": 0.7247282862663269, "grad_norm": 1.1038413047790527, "learning_rate": 8.686228946285695e-05} +{"ts": "2025-12-27T15:23:20", "event": "train_log", "step": 4230, "epoch": 1.7848101265822784, "progress_pct": 29.75, "epoch_pct": 29.75, "eta": "50:09:53", "max_grad_norm": 0.8, "loss": 0.5673812627792358, "grad_norm": 0.9666786789894104, "learning_rate": 8.684606034258206e-05} +{"ts": "2025-12-27T15:23:32", "event": "train_log", "step": 4232, "epoch": 1.7856540084388186, "progress_pct": 29.76, "epoch_pct": 29.76, "eta": "50:08:21", "max_grad_norm": 0.8, "loss": 0.5950504541397095, "grad_norm": 1.1972676515579224, "learning_rate": 8.682982272262869e-05} +{"ts": "2025-12-27T15:23:45", "event": "train_log", "step": 4234, "epoch": 1.7864978902953585, "progress_pct": 29.77, "epoch_pct": 29.77, "eta": "50:06:48", "max_grad_norm": 0.8, "loss": 0.6477514505386353, "grad_norm": 1.23736572265625, "learning_rate": 8.681357660674255e-05} +{"ts": "2025-12-27T15:23:58", "event": "train_log", "step": 4236, "epoch": 1.7873417721518987, "progress_pct": 29.79, "epoch_pct": 29.79, "eta": "50:05:18", "max_grad_norm": 0.8, "loss": 0.6180200576782227, "grad_norm": 1.0238158702850342, "learning_rate": 8.679732199867127e-05} +{"ts": "2025-12-27T15:24:10", "event": "train_log", "step": 4238, "epoch": 1.7881856540084389, "progress_pct": 29.8, "epoch_pct": 29.8, "eta": "50:03:46", "max_grad_norm": 0.8, "loss": 0.5771099328994751, "grad_norm": 1.0333375930786133, "learning_rate": 8.678105890216455e-05} +{"ts": "2025-12-27T15:24:22", "event": "train_log", "step": 4240, "epoch": 1.7890295358649788, "progress_pct": 29.82, "epoch_pct": 29.82, "eta": "50:02:13", "max_grad_norm": 0.8, "loss": 0.6592516899108887, "grad_norm": 1.30390202999115, "learning_rate": 8.676478732097393e-05} +{"ts": "2025-12-27T15:24:35", "event": "train_log", "step": 4242, "epoch": 1.789873417721519, "progress_pct": 29.83, "epoch_pct": 29.83, "eta": "50:00:43", "max_grad_norm": 0.8, "loss": 0.6662757396697998, "grad_norm": 1.115160346031189, "learning_rate": 8.674850725885294e-05} +{"ts": "2025-12-27T15:24:47", "event": "train_log", "step": 4244, "epoch": 1.7907172995780591, "progress_pct": 29.85, "epoch_pct": 29.85, "eta": "49:59:10", "max_grad_norm": 0.8, "loss": 0.6673333048820496, "grad_norm": 1.2130142450332642, "learning_rate": 8.67322187195571e-05} +{"ts": "2025-12-27T15:24:59", "event": "train_log", "step": 4246, "epoch": 1.791561181434599, "progress_pct": 29.86, "epoch_pct": 29.86, "eta": "49:57:37", "max_grad_norm": 0.8, "loss": 0.6698325872421265, "grad_norm": 1.1505554914474487, "learning_rate": 8.671592170684386e-05} +{"ts": "2025-12-27T15:25:12", "event": "train_log", "step": 4248, "epoch": 1.7924050632911392, "progress_pct": 29.87, "epoch_pct": 29.87, "eta": "49:56:06", "max_grad_norm": 0.8, "loss": 0.6216199398040771, "grad_norm": 1.0758062601089478, "learning_rate": 8.669961622447262e-05} +{"ts": "2025-12-27T15:25:25", "event": "train_log", "step": 4250, "epoch": 1.7932489451476794, "progress_pct": 29.89, "epoch_pct": 29.89, "eta": "49:54:37", "max_grad_norm": 0.8, "loss": 0.6460495591163635, "grad_norm": 0.9300920367240906, "learning_rate": 8.668330227620475e-05} +{"ts": "2025-12-27T15:25:37", "event": "train_log", "step": 4252, "epoch": 1.7940928270042193, "progress_pct": 29.9, "epoch_pct": 29.9, "eta": "49:53:04", "max_grad_norm": 0.8, "loss": 0.6949506998062134, "grad_norm": 1.3860046863555908, "learning_rate": 8.666697986580357e-05} +{"ts": "2025-12-27T15:25:49", "event": "train_log", "step": 4254, "epoch": 1.7949367088607595, "progress_pct": 29.92, "epoch_pct": 29.92, "eta": "49:51:32", "max_grad_norm": 0.8, "loss": 0.6320405602455139, "grad_norm": 1.2287555932998657, "learning_rate": 8.665064899703433e-05} +{"ts": "2025-12-27T15:26:02", "event": "train_log", "step": 4256, "epoch": 1.7957805907172997, "progress_pct": 29.93, "epoch_pct": 29.93, "eta": "49:50:03", "max_grad_norm": 0.8, "loss": 0.6635019779205322, "grad_norm": 1.1585466861724854, "learning_rate": 8.663430967366426e-05} +{"ts": "2025-12-27T15:26:15", "event": "train_log", "step": 4258, "epoch": 1.7966244725738396, "progress_pct": 29.94, "epoch_pct": 29.94, "eta": "49:48:32", "max_grad_norm": 0.8, "loss": 0.645052969455719, "grad_norm": 1.1007941961288452, "learning_rate": 8.661796189946252e-05} +{"ts": "2025-12-27T15:26:28", "event": "train_log", "step": 4260, "epoch": 1.7974683544303798, "progress_pct": 29.96, "epoch_pct": 29.96, "eta": "49:47:02", "max_grad_norm": 0.8, "loss": 0.70420902967453, "grad_norm": 1.2059847116470337, "learning_rate": 8.660160567820023e-05} +{"ts": "2025-12-27T15:26:41", "event": "train_log", "step": 4262, "epoch": 1.79831223628692, "progress_pct": 29.97, "epoch_pct": 29.97, "eta": "49:45:34", "max_grad_norm": 0.8, "loss": 0.6263765096664429, "grad_norm": 1.0648717880249023, "learning_rate": 8.658524101365044e-05} +{"ts": "2025-12-27T15:26:54", "event": "train_log", "step": 4264, "epoch": 1.7991561181434599, "progress_pct": 29.99, "epoch_pct": 29.99, "eta": "49:44:04", "max_grad_norm": 0.8, "loss": 0.6199937462806702, "grad_norm": 1.017052412033081, "learning_rate": 8.656886790958821e-05} +{"ts": "2025-12-27T15:27:08", "event": "train_log", "step": 4266, "epoch": 1.8, "progress_pct": 30.0, "epoch_pct": 30.0, "eta": "49:42:36", "max_grad_norm": 0.8, "loss": 0.5891271233558655, "grad_norm": 1.1153450012207031, "learning_rate": 8.655248636979045e-05} +{"ts": "2025-12-27T15:27:21", "event": "train_log", "step": 4268, "epoch": 1.8008438818565402, "progress_pct": 30.01, "epoch_pct": 30.01, "eta": "49:41:07", "max_grad_norm": 0.8, "loss": 0.5442121028900146, "grad_norm": 1.0661747455596924, "learning_rate": 8.65360963980361e-05} +{"ts": "2025-12-27T15:27:33", "event": "train_log", "step": 4270, "epoch": 1.8016877637130801, "progress_pct": 30.03, "epoch_pct": 30.03, "eta": "49:39:35", "max_grad_norm": 0.8, "loss": 0.6988245248794556, "grad_norm": 1.3049758672714233, "learning_rate": 8.6519697998106e-05} +{"ts": "2025-12-27T15:27:45", "event": "train_log", "step": 4272, "epoch": 1.80253164556962, "progress_pct": 30.04, "epoch_pct": 30.04, "eta": "49:38:03", "max_grad_norm": 0.8, "loss": 0.7260398864746094, "grad_norm": 1.2679938077926636, "learning_rate": 8.650329117378294e-05} +{"ts": "2025-12-27T15:27:57", "event": "train_log", "step": 4274, "epoch": 1.8033755274261605, "progress_pct": 30.06, "epoch_pct": 30.06, "eta": "49:36:32", "max_grad_norm": 0.8, "loss": 0.5757678151130676, "grad_norm": 1.0899536609649658, "learning_rate": 8.648687592885168e-05} +{"ts": "2025-12-27T15:28:10", "event": "train_log", "step": 4276, "epoch": 1.8042194092827004, "progress_pct": 30.07, "epoch_pct": 30.07, "eta": "49:35:01", "max_grad_norm": 0.8, "loss": 0.7042108178138733, "grad_norm": 1.4088575839996338, "learning_rate": 8.647045226709887e-05} +{"ts": "2025-12-27T15:28:22", "event": "train_log", "step": 4278, "epoch": 1.8050632911392404, "progress_pct": 30.08, "epoch_pct": 30.08, "eta": "49:33:30", "max_grad_norm": 0.8, "loss": 0.641275942325592, "grad_norm": 1.2143783569335938, "learning_rate": 8.645402019231316e-05} +{"ts": "2025-12-27T15:28:33", "event": "train_log", "step": 4280, "epoch": 1.8059071729957807, "progress_pct": 30.1, "epoch_pct": 30.1, "eta": "49:31:58", "max_grad_norm": 0.8, "loss": 0.7657124996185303, "grad_norm": 1.4072896242141724, "learning_rate": 8.64375797082851e-05} +{"ts": "2025-12-27T15:28:45", "event": "train_log", "step": 4282, "epoch": 1.8067510548523207, "progress_pct": 30.11, "epoch_pct": 30.11, "eta": "49:30:26", "max_grad_norm": 0.8, "loss": 0.713768720626831, "grad_norm": 1.2563380002975464, "learning_rate": 8.642113081880718e-05} +{"ts": "2025-12-27T15:28:58", "event": "train_log", "step": 4284, "epoch": 1.8075949367088606, "progress_pct": 30.13, "epoch_pct": 30.13, "eta": "49:28:56", "max_grad_norm": 0.8, "loss": 0.6276429295539856, "grad_norm": 1.1195416450500488, "learning_rate": 8.64046735276739e-05} +{"ts": "2025-12-27T15:29:11", "event": "train_log", "step": 4286, "epoch": 1.808438818565401, "progress_pct": 30.14, "epoch_pct": 30.14, "eta": "49:27:28", "max_grad_norm": 0.8, "loss": 0.5641238689422607, "grad_norm": 1.2472422122955322, "learning_rate": 8.638820783868158e-05} +{"ts": "2025-12-27T15:29:23", "event": "train_log", "step": 4288, "epoch": 1.809282700421941, "progress_pct": 30.15, "epoch_pct": 30.15, "eta": "49:25:58", "max_grad_norm": 0.8, "loss": 0.6312015056610107, "grad_norm": 1.1974313259124756, "learning_rate": 8.637173375562855e-05} +{"ts": "2025-12-27T15:29:36", "event": "train_log", "step": 4290, "epoch": 1.810126582278481, "progress_pct": 30.17, "epoch_pct": 30.17, "eta": "49:24:27", "max_grad_norm": 0.8, "loss": 0.6674410104751587, "grad_norm": 1.1673604249954224, "learning_rate": 8.63552512823151e-05} +{"ts": "2025-12-27T15:29:48", "event": "train_log", "step": 4292, "epoch": 1.810970464135021, "progress_pct": 30.18, "epoch_pct": 30.18, "eta": "49:22:58", "max_grad_norm": 0.8, "loss": 0.6772016286849976, "grad_norm": 1.199095368385315, "learning_rate": 8.633876042254337e-05} +{"ts": "2025-12-27T15:30:01", "event": "train_log", "step": 4294, "epoch": 1.8118143459915612, "progress_pct": 30.2, "epoch_pct": 30.2, "eta": "49:21:28", "max_grad_norm": 0.8, "loss": 0.6621671915054321, "grad_norm": 1.2302746772766113, "learning_rate": 8.632226118011752e-05} +{"ts": "2025-12-27T15:30:13", "event": "train_log", "step": 4296, "epoch": 1.8126582278481012, "progress_pct": 30.21, "epoch_pct": 30.21, "eta": "49:19:57", "max_grad_norm": 0.8, "loss": 0.6965363621711731, "grad_norm": 1.304010033607483, "learning_rate": 8.63057535588436e-05} +{"ts": "2025-12-27T15:30:25", "event": "train_log", "step": 4298, "epoch": 1.8135021097046413, "progress_pct": 30.23, "epoch_pct": 30.23, "eta": "49:18:28", "max_grad_norm": 0.8, "loss": 0.6300807595252991, "grad_norm": 1.223366618156433, "learning_rate": 8.62892375625296e-05} +{"ts": "2025-12-27T15:30:38", "event": "train_log", "step": 4300, "epoch": 1.8143459915611815, "progress_pct": 30.24, "epoch_pct": 30.24, "eta": "49:17:00", "max_grad_norm": 0.8, "loss": 0.5610660910606384, "grad_norm": 1.028496265411377, "learning_rate": 8.627271319498544e-05} +{"ts": "2025-12-27T15:39:13", "event": "train_log", "step": 4300, "epoch": 1.8143459915611815, "progress_pct": 30.24, "epoch_pct": 30.24, "eta": "49:36:46", "max_grad_norm": 0.8, "eval_loss": 0.6981000900268555, "eval_runtime": 514.4659, "eval_samples_per_second": 4.096, "eval_steps_per_second": 4.096} +{"ts": "2025-12-27T15:39:25", "event": "train_log", "step": 4302, "epoch": 1.8151898734177214, "progress_pct": 30.25, "epoch_pct": 30.25, "eta": "49:35:17", "max_grad_norm": 0.8, "loss": 0.6666551232337952, "grad_norm": 1.2050007581710815, "learning_rate": 8.625618046002298e-05} +{"ts": "2025-12-27T15:39:38", "event": "train_log", "step": 4304, "epoch": 1.8160337552742616, "progress_pct": 30.27, "epoch_pct": 30.27, "eta": "49:33:46", "max_grad_norm": 0.8, "loss": 0.6631835103034973, "grad_norm": 1.1233220100402832, "learning_rate": 8.6239639361456e-05} +{"ts": "2025-12-27T15:39:50", "event": "train_log", "step": 4306, "epoch": 1.8168776371308017, "progress_pct": 30.28, "epoch_pct": 30.28, "eta": "49:32:16", "max_grad_norm": 0.8, "loss": 0.6395270228385925, "grad_norm": 1.1262956857681274, "learning_rate": 8.622308990310021e-05} +{"ts": "2025-12-27T15:40:03", "event": "train_log", "step": 4308, "epoch": 1.8177215189873417, "progress_pct": 30.3, "epoch_pct": 30.3, "eta": "49:30:47", "max_grad_norm": 0.8, "loss": 0.6165015697479248, "grad_norm": 1.0448222160339355, "learning_rate": 8.620653208877328e-05} +{"ts": "2025-12-27T15:40:16", "event": "train_log", "step": 4310, "epoch": 1.8185654008438819, "progress_pct": 30.31, "epoch_pct": 30.31, "eta": "49:29:18", "max_grad_norm": 0.8, "loss": 0.5915844440460205, "grad_norm": 1.1555759906768799, "learning_rate": 8.618996592229473e-05} +{"ts": "2025-12-27T15:40:29", "event": "train_log", "step": 4312, "epoch": 1.819409282700422, "progress_pct": 30.32, "epoch_pct": 30.32, "eta": "49:27:49", "max_grad_norm": 0.8, "loss": 0.6491456627845764, "grad_norm": 1.5407506227493286, "learning_rate": 8.617339140748608e-05} +{"ts": "2025-12-27T15:40:42", "event": "train_log", "step": 4314, "epoch": 1.820253164556962, "progress_pct": 30.34, "epoch_pct": 30.34, "eta": "49:26:20", "max_grad_norm": 0.8, "loss": 0.6053901314735413, "grad_norm": 1.3690788745880127, "learning_rate": 8.615680854817077e-05} +{"ts": "2025-12-27T15:40:55", "event": "train_log", "step": 4316, "epoch": 1.8210970464135021, "progress_pct": 30.35, "epoch_pct": 30.35, "eta": "49:24:52", "max_grad_norm": 0.8, "loss": 0.5821644067764282, "grad_norm": 1.052583932876587, "learning_rate": 8.614021734817413e-05} +{"ts": "2025-12-27T15:41:08", "event": "train_log", "step": 4318, "epoch": 1.8219409282700423, "progress_pct": 30.37, "epoch_pct": 30.37, "eta": "49:23:22", "max_grad_norm": 0.8, "loss": 0.645878255367279, "grad_norm": 1.090567708015442, "learning_rate": 8.612361781132344e-05} +{"ts": "2025-12-27T15:41:20", "event": "train_log", "step": 4320, "epoch": 1.8227848101265822, "progress_pct": 30.38, "epoch_pct": 30.38, "eta": "49:21:53", "max_grad_norm": 0.8, "loss": 0.6883123517036438, "grad_norm": 1.122719645500183, "learning_rate": 8.610700994144787e-05} +{"ts": "2025-12-27T15:41:32", "event": "train_log", "step": 4322, "epoch": 1.8236286919831224, "progress_pct": 30.39, "epoch_pct": 30.39, "eta": "49:20:23", "max_grad_norm": 0.8, "loss": 0.6918330788612366, "grad_norm": 1.3273001909255981, "learning_rate": 8.609039374237856e-05} +{"ts": "2025-12-27T15:41:45", "event": "train_log", "step": 4324, "epoch": 1.8244725738396625, "progress_pct": 30.41, "epoch_pct": 30.41, "eta": "49:18:53", "max_grad_norm": 0.8, "loss": 0.6292204856872559, "grad_norm": 1.0628443956375122, "learning_rate": 8.607376921794855e-05} +{"ts": "2025-12-27T15:41:57", "event": "train_log", "step": 4326, "epoch": 1.8253164556962025, "progress_pct": 30.42, "epoch_pct": 30.42, "eta": "49:17:24", "max_grad_norm": 0.8, "loss": 0.6136105060577393, "grad_norm": 1.287466287612915, "learning_rate": 8.605713637199279e-05} +{"ts": "2025-12-27T15:42:10", "event": "train_log", "step": 4328, "epoch": 1.8261603375527427, "progress_pct": 30.44, "epoch_pct": 30.44, "eta": "49:15:55", "max_grad_norm": 0.8, "loss": 0.6099681854248047, "grad_norm": 1.1399345397949219, "learning_rate": 8.604049520834816e-05} +{"ts": "2025-12-27T15:42:23", "event": "train_log", "step": 4330, "epoch": 1.8270042194092828, "progress_pct": 30.45, "epoch_pct": 30.45, "eta": "49:14:27", "max_grad_norm": 0.8, "loss": 0.6267056465148926, "grad_norm": 1.1131435632705688, "learning_rate": 8.602384573085345e-05} +{"ts": "2025-12-27T15:42:36", "event": "train_log", "step": 4332, "epoch": 1.8278481012658228, "progress_pct": 30.46, "epoch_pct": 30.46, "eta": "49:12:58", "max_grad_norm": 0.8, "loss": 0.609437882900238, "grad_norm": 1.1312925815582275, "learning_rate": 8.600718794334939e-05} +{"ts": "2025-12-27T15:42:48", "event": "train_log", "step": 4334, "epoch": 1.828691983122363, "progress_pct": 30.48, "epoch_pct": 30.48, "eta": "49:11:29", "max_grad_norm": 0.8, "loss": 0.727881669998169, "grad_norm": 1.3711494207382202, "learning_rate": 8.599052184967859e-05} +{"ts": "2025-12-27T15:43:00", "event": "train_log", "step": 4336, "epoch": 1.829535864978903, "progress_pct": 30.49, "epoch_pct": 30.49, "eta": "49:09:59", "max_grad_norm": 0.8, "loss": 0.6771696209907532, "grad_norm": 1.1403605937957764, "learning_rate": 8.597384745368562e-05} +{"ts": "2025-12-27T15:43:13", "event": "train_log", "step": 4338, "epoch": 1.830379746835443, "progress_pct": 30.51, "epoch_pct": 30.51, "eta": "49:08:30", "max_grad_norm": 0.8, "loss": 0.6812924742698669, "grad_norm": 1.2769951820373535, "learning_rate": 8.595716475921693e-05} +{"ts": "2025-12-27T15:43:25", "event": "train_log", "step": 4340, "epoch": 1.831223628691983, "progress_pct": 30.52, "epoch_pct": 30.52, "eta": "49:07:02", "max_grad_norm": 0.8, "loss": 0.6403515338897705, "grad_norm": 1.055721402168274, "learning_rate": 8.59404737701209e-05} +{"ts": "2025-12-27T15:43:37", "event": "train_log", "step": 4342, "epoch": 1.8320675105485233, "progress_pct": 30.53, "epoch_pct": 30.53, "eta": "49:05:32", "max_grad_norm": 0.8, "loss": 0.663240373134613, "grad_norm": 1.1047639846801758, "learning_rate": 8.592377449024784e-05} +{"ts": "2025-12-27T15:43:50", "event": "train_log", "step": 4344, "epoch": 1.8329113924050633, "progress_pct": 30.55, "epoch_pct": 30.55, "eta": "49:04:04", "max_grad_norm": 0.8, "loss": 0.6398993134498596, "grad_norm": 1.0808883905410767, "learning_rate": 8.590706692344991e-05} +{"ts": "2025-12-27T15:44:03", "event": "train_log", "step": 4346, "epoch": 1.8337552742616032, "progress_pct": 30.56, "epoch_pct": 30.56, "eta": "49:02:35", "max_grad_norm": 0.8, "loss": 0.6838348507881165, "grad_norm": 1.2433407306671143, "learning_rate": 8.589035107358125e-05} +{"ts": "2025-12-27T15:44:16", "event": "train_log", "step": 4348, "epoch": 1.8345991561181436, "progress_pct": 30.58, "epoch_pct": 30.58, "eta": "49:01:08", "max_grad_norm": 0.8, "loss": 0.640884280204773, "grad_norm": 1.031216025352478, "learning_rate": 8.58736269444979e-05} +{"ts": "2025-12-27T15:44:28", "event": "train_log", "step": 4350, "epoch": 1.8354430379746836, "progress_pct": 30.59, "epoch_pct": 30.59, "eta": "48:59:39", "max_grad_norm": 0.8, "loss": 0.6346741914749146, "grad_norm": 1.1417057514190674, "learning_rate": 8.585689454005776e-05} +{"ts": "2025-12-27T15:44:40", "event": "train_log", "step": 4352, "epoch": 1.8362869198312235, "progress_pct": 30.6, "epoch_pct": 30.6, "eta": "48:58:10", "max_grad_norm": 0.8, "loss": 0.6209521889686584, "grad_norm": 1.210988998413086, "learning_rate": 8.584015386412072e-05} +{"ts": "2025-12-27T15:44:53", "event": "train_log", "step": 4354, "epoch": 1.8371308016877637, "progress_pct": 30.62, "epoch_pct": 30.62, "eta": "48:56:41", "max_grad_norm": 0.8, "loss": 0.6699252128601074, "grad_norm": 1.2120760679244995, "learning_rate": 8.582340492054847e-05} +{"ts": "2025-12-27T15:45:05", "event": "train_log", "step": 4356, "epoch": 1.8379746835443038, "progress_pct": 30.63, "epoch_pct": 30.63, "eta": "48:55:13", "max_grad_norm": 0.8, "loss": 0.6472980380058289, "grad_norm": 1.1768114566802979, "learning_rate": 8.580664771320475e-05} +{"ts": "2025-12-27T15:45:18", "event": "train_log", "step": 4358, "epoch": 1.8388185654008438, "progress_pct": 30.65, "epoch_pct": 30.65, "eta": "48:53:44", "max_grad_norm": 0.8, "loss": 0.6440452933311462, "grad_norm": 1.060070276260376, "learning_rate": 8.578988224595506e-05} +{"ts": "2025-12-27T15:45:30", "event": "train_log", "step": 4360, "epoch": 1.839662447257384, "progress_pct": 30.66, "epoch_pct": 30.66, "eta": "48:52:16", "max_grad_norm": 0.8, "loss": 0.5894474387168884, "grad_norm": 1.1366443634033203, "learning_rate": 8.57731085226669e-05} +{"ts": "2025-12-27T15:45:42", "event": "train_log", "step": 4362, "epoch": 1.840506329113924, "progress_pct": 30.68, "epoch_pct": 30.68, "eta": "48:50:47", "max_grad_norm": 0.8, "loss": 0.5868900418281555, "grad_norm": 1.1571751832962036, "learning_rate": 8.575632654720963e-05} +{"ts": "2025-12-27T15:45:55", "event": "train_log", "step": 4364, "epoch": 1.841350210970464, "progress_pct": 30.69, "epoch_pct": 30.69, "eta": "48:49:20", "max_grad_norm": 0.8, "loss": 0.5841533541679382, "grad_norm": 1.1983840465545654, "learning_rate": 8.573953632345453e-05} +{"ts": "2025-12-27T15:46:08", "event": "train_log", "step": 4366, "epoch": 1.8421940928270042, "progress_pct": 30.7, "epoch_pct": 30.7, "eta": "48:47:53", "max_grad_norm": 0.8, "loss": 0.5503215193748474, "grad_norm": 1.101806640625, "learning_rate": 8.572273785527481e-05} +{"ts": "2025-12-27T15:46:21", "event": "train_log", "step": 4368, "epoch": 1.8430379746835444, "progress_pct": 30.72, "epoch_pct": 30.72, "eta": "48:46:27", "max_grad_norm": 0.8, "loss": 0.6131128072738647, "grad_norm": 1.0327471494674683, "learning_rate": 8.570593114654552e-05} +{"ts": "2025-12-27T15:46:34", "event": "train_log", "step": 4370, "epoch": 1.8438818565400843, "progress_pct": 30.73, "epoch_pct": 30.73, "eta": "48:45:01", "max_grad_norm": 0.8, "loss": 0.6614060401916504, "grad_norm": 1.1421098709106445, "learning_rate": 8.568911620114368e-05} +{"ts": "2025-12-27T15:46:47", "event": "train_log", "step": 4372, "epoch": 1.8447257383966245, "progress_pct": 30.75, "epoch_pct": 30.75, "eta": "48:43:33", "max_grad_norm": 0.8, "loss": 0.6392307877540588, "grad_norm": 1.1707026958465576, "learning_rate": 8.567229302294814e-05} +{"ts": "2025-12-27T15:46:59", "event": "train_log", "step": 4374, "epoch": 1.8455696202531646, "progress_pct": 30.76, "epoch_pct": 30.76, "eta": "48:42:04", "max_grad_norm": 0.8, "loss": 0.6560825109481812, "grad_norm": 1.1704418659210205, "learning_rate": 8.565546161583969e-05} +{"ts": "2025-12-27T15:47:11", "event": "train_log", "step": 4376, "epoch": 1.8464135021097046, "progress_pct": 30.77, "epoch_pct": 30.77, "eta": "48:40:36", "max_grad_norm": 0.8, "loss": 0.6996290683746338, "grad_norm": 1.3618037700653076, "learning_rate": 8.563862198370103e-05} +{"ts": "2025-12-27T15:47:24", "event": "train_log", "step": 4378, "epoch": 1.8472573839662447, "progress_pct": 30.79, "epoch_pct": 30.79, "eta": "48:39:09", "max_grad_norm": 0.8, "loss": 0.6776535511016846, "grad_norm": 1.116645097732544, "learning_rate": 8.562177413041674e-05} +{"ts": "2025-12-27T15:47:36", "event": "train_log", "step": 4380, "epoch": 1.8481012658227849, "progress_pct": 30.8, "epoch_pct": 30.8, "eta": "48:37:42", "max_grad_norm": 0.8, "loss": 0.6390423774719238, "grad_norm": 1.1669151782989502, "learning_rate": 8.560491805987327e-05} +{"ts": "2025-12-27T15:47:49", "event": "train_log", "step": 4382, "epoch": 1.8489451476793248, "progress_pct": 30.82, "epoch_pct": 30.82, "eta": "48:36:13", "max_grad_norm": 0.8, "loss": 0.6554020047187805, "grad_norm": 1.2188117504119873, "learning_rate": 8.558805377595904e-05} +{"ts": "2025-12-27T15:48:01", "event": "train_log", "step": 4384, "epoch": 1.849789029535865, "progress_pct": 30.83, "epoch_pct": 30.83, "eta": "48:34:47", "max_grad_norm": 0.8, "loss": 0.6291787624359131, "grad_norm": 1.216829776763916, "learning_rate": 8.557118128256425e-05} +{"ts": "2025-12-27T15:48:14", "event": "train_log", "step": 4386, "epoch": 1.8506329113924052, "progress_pct": 30.84, "epoch_pct": 30.84, "eta": "48:33:21", "max_grad_norm": 0.8, "loss": 0.6484442949295044, "grad_norm": 1.0431596040725708, "learning_rate": 8.555430058358111e-05} +{"ts": "2025-12-27T15:48:26", "event": "train_log", "step": 4388, "epoch": 1.851476793248945, "progress_pct": 30.86, "epoch_pct": 30.86, "eta": "48:31:52", "max_grad_norm": 0.8, "loss": 0.7034047842025757, "grad_norm": 1.3015289306640625, "learning_rate": 8.553741168290367e-05} +{"ts": "2025-12-27T15:48:38", "event": "train_log", "step": 4390, "epoch": 1.8523206751054853, "progress_pct": 30.87, "epoch_pct": 30.87, "eta": "48:30:23", "max_grad_norm": 0.8, "loss": 0.644135594367981, "grad_norm": 1.2062040567398071, "learning_rate": 8.552051458442785e-05} +{"ts": "2025-12-27T15:48:50", "event": "train_log", "step": 4392, "epoch": 1.8531645569620254, "progress_pct": 30.89, "epoch_pct": 30.89, "eta": "48:28:55", "max_grad_norm": 0.8, "loss": 0.6767282485961914, "grad_norm": 1.238461971282959, "learning_rate": 8.55036092920515e-05} +{"ts": "2025-12-27T15:49:01", "event": "train_log", "step": 4394, "epoch": 1.8540084388185654, "progress_pct": 30.9, "epoch_pct": 30.9, "eta": "48:27:26", "max_grad_norm": 0.8, "loss": 0.7292267680168152, "grad_norm": 1.2978830337524414, "learning_rate": 8.548669580967435e-05} +{"ts": "2025-12-27T15:49:14", "event": "train_log", "step": 4396, "epoch": 1.8548523206751055, "progress_pct": 30.91, "epoch_pct": 30.91, "eta": "48:25:58", "max_grad_norm": 0.8, "loss": 0.6788421273231506, "grad_norm": 1.1448328495025635, "learning_rate": 8.546977414119801e-05} +{"ts": "2025-12-27T15:49:27", "event": "train_log", "step": 4398, "epoch": 1.8556962025316457, "progress_pct": 30.93, "epoch_pct": 30.93, "eta": "48:24:33", "max_grad_norm": 0.8, "loss": 0.6745942234992981, "grad_norm": 1.0685368776321411, "learning_rate": 8.5452844290526e-05} +{"ts": "2025-12-27T15:49:39", "event": "train_log", "step": 4400, "epoch": 1.8565400843881856, "progress_pct": 30.94, "epoch_pct": 30.94, "eta": "48:23:05", "max_grad_norm": 0.8, "loss": 0.6351125836372375, "grad_norm": 1.125707983970642, "learning_rate": 8.543590626156368e-05} +{"ts": "2025-12-27T15:58:13", "event": "train_log", "step": 4400, "epoch": 1.8565400843881856, "progress_pct": 30.94, "epoch_pct": 30.94, "eta": "48:42:11", "max_grad_norm": 0.8, "eval_loss": 0.6961485147476196, "eval_runtime": 513.5724, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103} +{"ts": "2025-12-27T15:58:26", "event": "train_log", "step": 4402, "epoch": 1.8573839662447258, "progress_pct": 30.96, "epoch_pct": 30.96, "eta": "48:40:45", "max_grad_norm": 0.8, "loss": 0.5840762257575989, "grad_norm": 1.072179913520813, "learning_rate": 8.541896005821835e-05} +{"ts": "2025-12-27T15:58:37", "event": "train_log", "step": 4404, "epoch": 1.858227848101266, "progress_pct": 30.97, "epoch_pct": 30.97, "eta": "48:39:16", "max_grad_norm": 0.8, "loss": 0.6431074738502502, "grad_norm": 1.2572803497314453, "learning_rate": 8.540200568439915e-05} +{"ts": "2025-12-27T15:58:49", "event": "train_log", "step": 4406, "epoch": 1.859071729957806, "progress_pct": 30.98, "epoch_pct": 30.98, "eta": "48:37:46", "max_grad_norm": 0.8, "loss": 0.708808183670044, "grad_norm": 1.3294413089752197, "learning_rate": 8.538504314401718e-05} +{"ts": "2025-12-27T15:59:01", "event": "train_log", "step": 4408, "epoch": 1.8599156118143458, "progress_pct": 31.0, "epoch_pct": 31.0, "eta": "48:36:18", "max_grad_norm": 0.8, "loss": 0.6580085754394531, "grad_norm": 1.1775587797164917, "learning_rate": 8.536807244098533e-05} +{"ts": "2025-12-27T15:59:13", "event": "train_log", "step": 4410, "epoch": 1.8607594936708862, "progress_pct": 31.01, "epoch_pct": 31.01, "eta": "48:34:50", "max_grad_norm": 0.8, "loss": 0.6500136256217957, "grad_norm": 1.1880089044570923, "learning_rate": 8.53510935792184e-05} +{"ts": "2025-12-27T15:59:25", "event": "train_log", "step": 4412, "epoch": 1.8616033755274262, "progress_pct": 31.03, "epoch_pct": 31.03, "eta": "48:33:22", "max_grad_norm": 0.8, "loss": 0.6922352313995361, "grad_norm": 1.2166204452514648, "learning_rate": 8.533410656263313e-05} +{"ts": "2025-12-27T15:59:38", "event": "train_log", "step": 4414, "epoch": 1.862447257383966, "progress_pct": 31.04, "epoch_pct": 31.04, "eta": "48:31:57", "max_grad_norm": 0.8, "loss": 0.6761626601219177, "grad_norm": 1.0405415296554565, "learning_rate": 8.531711139514808e-05} +{"ts": "2025-12-27T15:59:51", "event": "train_log", "step": 4416, "epoch": 1.8632911392405065, "progress_pct": 31.05, "epoch_pct": 31.05, "eta": "48:30:30", "max_grad_norm": 0.8, "loss": 0.672576904296875, "grad_norm": 1.0674270391464233, "learning_rate": 8.530010808068371e-05} +{"ts": "2025-12-27T16:00:04", "event": "train_log", "step": 4418, "epoch": 1.8641350210970464, "progress_pct": 31.07, "epoch_pct": 31.07, "eta": "48:29:05", "max_grad_norm": 0.8, "loss": 0.5521218180656433, "grad_norm": 1.0584741830825806, "learning_rate": 8.528309662316236e-05} +{"ts": "2025-12-27T16:00:16", "event": "train_log", "step": 4420, "epoch": 1.8649789029535864, "progress_pct": 31.08, "epoch_pct": 31.08, "eta": "48:27:37", "max_grad_norm": 0.8, "loss": 0.6546680927276611, "grad_norm": 1.3619039058685303, "learning_rate": 8.526607702650824e-05} +{"ts": "2025-12-27T16:00:30", "event": "train_log", "step": 4422, "epoch": 1.8658227848101265, "progress_pct": 31.1, "epoch_pct": 31.1, "eta": "48:26:12", "max_grad_norm": 0.8, "loss": 0.6043933629989624, "grad_norm": 0.9904745221138, "learning_rate": 8.524904929464745e-05} +{"ts": "2025-12-27T16:00:41", "event": "train_log", "step": 4424, "epoch": 1.8666666666666667, "progress_pct": 31.11, "epoch_pct": 31.11, "eta": "48:24:43", "max_grad_norm": 0.8, "loss": 0.7106801271438599, "grad_norm": 1.3046703338623047, "learning_rate": 8.523201343150795e-05} +{"ts": "2025-12-27T16:00:54", "event": "train_log", "step": 4426, "epoch": 1.8675105485232066, "progress_pct": 31.13, "epoch_pct": 31.13, "eta": "48:23:16", "max_grad_norm": 0.8, "loss": 0.6456703543663025, "grad_norm": 1.1166832447052002, "learning_rate": 8.52149694410196e-05} +{"ts": "2025-12-27T16:01:07", "event": "train_log", "step": 4428, "epoch": 1.8683544303797468, "progress_pct": 31.14, "epoch_pct": 31.14, "eta": "48:21:51", "max_grad_norm": 0.8, "loss": 0.5963318347930908, "grad_norm": 1.1260632276535034, "learning_rate": 8.519791732711412e-05} +{"ts": "2025-12-27T16:01:19", "event": "train_log", "step": 4430, "epoch": 1.869198312236287, "progress_pct": 31.15, "epoch_pct": 31.15, "eta": "48:20:24", "max_grad_norm": 0.8, "loss": 0.6295356750488281, "grad_norm": 1.0990599393844604, "learning_rate": 8.51808570937251e-05} +{"ts": "2025-12-27T16:01:32", "event": "train_log", "step": 4432, "epoch": 1.870042194092827, "progress_pct": 31.17, "epoch_pct": 31.17, "eta": "48:18:59", "max_grad_norm": 0.8, "loss": 0.6984617114067078, "grad_norm": 1.3689274787902832, "learning_rate": 8.516378874478801e-05} +{"ts": "2025-12-27T16:01:45", "event": "train_log", "step": 4434, "epoch": 1.870886075949367, "progress_pct": 31.18, "epoch_pct": 31.18, "eta": "48:17:33", "max_grad_norm": 0.8, "loss": 0.5598900318145752, "grad_norm": 1.0986580848693848, "learning_rate": 8.514671228424018e-05} +{"ts": "2025-12-27T16:01:58", "event": "train_log", "step": 4436, "epoch": 1.8717299578059072, "progress_pct": 31.2, "epoch_pct": 31.2, "eta": "48:16:07", "max_grad_norm": 0.8, "loss": 0.6286435723304749, "grad_norm": 0.9570761322975159, "learning_rate": 8.512962771602085e-05} +{"ts": "2025-12-27T16:02:10", "event": "train_log", "step": 4438, "epoch": 1.8725738396624472, "progress_pct": 31.21, "epoch_pct": 31.21, "eta": "48:14:41", "max_grad_norm": 0.8, "loss": 0.5956313014030457, "grad_norm": 1.1480669975280762, "learning_rate": 8.511253504407107e-05} +{"ts": "2025-12-27T16:02:22", "event": "train_log", "step": 4440, "epoch": 1.8734177215189873, "progress_pct": 31.22, "epoch_pct": 31.22, "eta": "48:13:13", "max_grad_norm": 0.8, "loss": 0.6523844599723816, "grad_norm": 1.1132479906082153, "learning_rate": 8.50954342723338e-05} +{"ts": "2025-12-27T16:02:35", "event": "train_log", "step": 4442, "epoch": 1.8742616033755275, "progress_pct": 31.24, "epoch_pct": 31.24, "eta": "48:11:48", "max_grad_norm": 0.8, "loss": 0.6231355667114258, "grad_norm": 1.1569167375564575, "learning_rate": 8.507832540475387e-05} +{"ts": "2025-12-27T16:02:48", "event": "train_log", "step": 4444, "epoch": 1.8751054852320674, "progress_pct": 31.25, "epoch_pct": 31.25, "eta": "48:10:23", "max_grad_norm": 0.8, "loss": 0.660773754119873, "grad_norm": 1.1327043771743774, "learning_rate": 8.506120844527796e-05} +{"ts": "2025-12-27T16:03:01", "event": "train_log", "step": 4446, "epoch": 1.8759493670886076, "progress_pct": 31.27, "epoch_pct": 31.27, "eta": "48:08:59", "max_grad_norm": 0.8, "loss": 0.6319235563278198, "grad_norm": 0.8939630389213562, "learning_rate": 8.504408339785463e-05} +{"ts": "2025-12-27T16:03:14", "event": "train_log", "step": 4448, "epoch": 1.8767932489451478, "progress_pct": 31.28, "epoch_pct": 31.28, "eta": "48:07:33", "max_grad_norm": 0.8, "loss": 0.6753001809120178, "grad_norm": 1.1910638809204102, "learning_rate": 8.50269502664343e-05} +{"ts": "2025-12-27T16:03:27", "event": "train_log", "step": 4450, "epoch": 1.8776371308016877, "progress_pct": 31.29, "epoch_pct": 31.29, "eta": "48:06:08", "max_grad_norm": 0.8, "loss": 0.6300671696662903, "grad_norm": 1.1502408981323242, "learning_rate": 8.500980905496923e-05} +{"ts": "2025-12-27T16:03:39", "event": "train_log", "step": 4452, "epoch": 1.8784810126582279, "progress_pct": 31.31, "epoch_pct": 31.31, "eta": "48:04:43", "max_grad_norm": 0.8, "loss": 0.6196691989898682, "grad_norm": 1.0639009475708008, "learning_rate": 8.49926597674136e-05} +{"ts": "2025-12-27T16:03:51", "event": "train_log", "step": 4454, "epoch": 1.879324894514768, "progress_pct": 31.32, "epoch_pct": 31.32, "eta": "48:03:15", "max_grad_norm": 0.8, "loss": 0.7029181122779846, "grad_norm": 1.1072754859924316, "learning_rate": 8.497550240772341e-05} +{"ts": "2025-12-27T16:04:03", "event": "train_log", "step": 4456, "epoch": 1.880168776371308, "progress_pct": 31.34, "epoch_pct": 31.34, "eta": "48:01:49", "max_grad_norm": 0.8, "loss": 0.65432208776474, "grad_norm": 1.0440188646316528, "learning_rate": 8.495833697985652e-05} +{"ts": "2025-12-27T16:04:16", "event": "train_log", "step": 4458, "epoch": 1.8810126582278481, "progress_pct": 31.35, "epoch_pct": 31.35, "eta": "48:00:24", "max_grad_norm": 0.8, "loss": 0.6446614861488342, "grad_norm": 1.0646617412567139, "learning_rate": 8.494116348777269e-05} +{"ts": "2025-12-27T16:04:29", "event": "train_log", "step": 4460, "epoch": 1.8818565400843883, "progress_pct": 31.36, "epoch_pct": 31.36, "eta": "47:58:59", "max_grad_norm": 0.8, "loss": 0.6430497765541077, "grad_norm": 1.2163805961608887, "learning_rate": 8.492398193543349e-05} +{"ts": "2025-12-27T16:04:41", "event": "train_log", "step": 4462, "epoch": 1.8827004219409282, "progress_pct": 31.38, "epoch_pct": 31.38, "eta": "47:57:32", "max_grad_norm": 0.8, "loss": 0.6609845161437988, "grad_norm": 1.2715297937393188, "learning_rate": 8.490679232680241e-05} +{"ts": "2025-12-27T16:04:53", "event": "train_log", "step": 4464, "epoch": 1.8835443037974684, "progress_pct": 31.39, "epoch_pct": 31.39, "eta": "47:56:07", "max_grad_norm": 0.8, "loss": 0.5791062712669373, "grad_norm": 1.0435588359832764, "learning_rate": 8.488959466584469e-05} +{"ts": "2025-12-27T16:05:06", "event": "train_log", "step": 4466, "epoch": 1.8843881856540086, "progress_pct": 31.41, "epoch_pct": 31.41, "eta": "47:54:42", "max_grad_norm": 0.8, "loss": 0.6312171220779419, "grad_norm": 1.229202151298523, "learning_rate": 8.487238895652759e-05} +{"ts": "2025-12-27T16:05:18", "event": "train_log", "step": 4468, "epoch": 1.8852320675105485, "progress_pct": 31.42, "epoch_pct": 31.42, "eta": "47:53:17", "max_grad_norm": 0.8, "loss": 0.6698815226554871, "grad_norm": 1.0713022947311401, "learning_rate": 8.485517520282008e-05} +{"ts": "2025-12-27T16:05:31", "event": "train_log", "step": 4470, "epoch": 1.8860759493670884, "progress_pct": 31.43, "epoch_pct": 31.43, "eta": "47:51:52", "max_grad_norm": 0.8, "loss": 0.6283810138702393, "grad_norm": 1.0172312259674072, "learning_rate": 8.483795340869305e-05} +{"ts": "2025-12-27T16:05:44", "event": "train_log", "step": 4472, "epoch": 1.8869198312236288, "progress_pct": 31.45, "epoch_pct": 31.45, "eta": "47:50:28", "max_grad_norm": 0.8, "loss": 0.6659437417984009, "grad_norm": 1.2880207300186157, "learning_rate": 8.482072357811926e-05} +{"ts": "2025-12-27T16:05:56", "event": "train_log", "step": 4474, "epoch": 1.8877637130801688, "progress_pct": 31.46, "epoch_pct": 31.46, "eta": "47:49:02", "max_grad_norm": 0.8, "loss": 0.6190289258956909, "grad_norm": 1.0840508937835693, "learning_rate": 8.480348571507329e-05} +{"ts": "2025-12-27T16:06:09", "event": "train_log", "step": 4476, "epoch": 1.8886075949367087, "progress_pct": 31.48, "epoch_pct": 31.48, "eta": "47:47:38", "max_grad_norm": 0.8, "loss": 0.5760066509246826, "grad_norm": 1.1101994514465332, "learning_rate": 8.478623982353156e-05} +{"ts": "2025-12-27T16:06:22", "event": "train_log", "step": 4478, "epoch": 1.889451476793249, "progress_pct": 31.49, "epoch_pct": 31.49, "eta": "47:46:13", "max_grad_norm": 0.8, "loss": 0.6151811480522156, "grad_norm": 1.2388770580291748, "learning_rate": 8.476898590747237e-05} +{"ts": "2025-12-27T16:06:35", "event": "train_log", "step": 4480, "epoch": 1.890295358649789, "progress_pct": 31.5, "epoch_pct": 31.5, "eta": "47:44:49", "max_grad_norm": 0.8, "loss": 0.5991593599319458, "grad_norm": 0.9986408948898315, "learning_rate": 8.475172397087591e-05} +{"ts": "2025-12-27T16:06:47", "event": "train_log", "step": 4482, "epoch": 1.891139240506329, "progress_pct": 31.52, "epoch_pct": 31.52, "eta": "47:43:24", "max_grad_norm": 0.8, "loss": 0.7262179255485535, "grad_norm": 1.1380778551101685, "learning_rate": 8.473445401772415e-05} +{"ts": "2025-12-27T16:07:00", "event": "train_log", "step": 4484, "epoch": 1.8919831223628694, "progress_pct": 31.53, "epoch_pct": 31.53, "eta": "47:42:00", "max_grad_norm": 0.8, "loss": 0.5806916356086731, "grad_norm": 1.3933676481246948, "learning_rate": 8.471717605200092e-05} +{"ts": "2025-12-27T16:07:13", "event": "train_log", "step": 4486, "epoch": 1.8928270042194093, "progress_pct": 31.55, "epoch_pct": 31.55, "eta": "47:40:37", "max_grad_norm": 0.8, "loss": 0.617904782295227, "grad_norm": 1.0242944955825806, "learning_rate": 8.469989007769194e-05} +{"ts": "2025-12-27T16:07:26", "event": "train_log", "step": 4488, "epoch": 1.8936708860759492, "progress_pct": 31.56, "epoch_pct": 31.56, "eta": "47:39:13", "max_grad_norm": 0.8, "loss": 0.6488202810287476, "grad_norm": 1.0909028053283691, "learning_rate": 8.468259609878475e-05} +{"ts": "2025-12-27T16:07:39", "event": "train_log", "step": 4490, "epoch": 1.8945147679324894, "progress_pct": 31.58, "epoch_pct": 31.58, "eta": "47:37:49", "max_grad_norm": 0.8, "loss": 0.6015118956565857, "grad_norm": 1.042611002922058, "learning_rate": 8.466529411926874e-05} +{"ts": "2025-12-27T16:07:51", "event": "train_log", "step": 4492, "epoch": 1.8953586497890296, "progress_pct": 31.59, "epoch_pct": 31.59, "eta": "47:36:23", "max_grad_norm": 0.8, "loss": 0.7035272717475891, "grad_norm": 1.3965784311294556, "learning_rate": 8.46479841431351e-05} +{"ts": "2025-12-27T16:08:03", "event": "train_log", "step": 4494, "epoch": 1.8962025316455695, "progress_pct": 31.6, "epoch_pct": 31.6, "eta": "47:34:59", "max_grad_norm": 0.8, "loss": 0.6611229777336121, "grad_norm": 1.1486462354660034, "learning_rate": 8.463066617437698e-05} +{"ts": "2025-12-27T16:08:15", "event": "train_log", "step": 4496, "epoch": 1.8970464135021097, "progress_pct": 31.62, "epoch_pct": 31.62, "eta": "47:33:34", "max_grad_norm": 0.8, "loss": 0.6378056406974792, "grad_norm": 1.0845859050750732, "learning_rate": 8.461334021698925e-05} +{"ts": "2025-12-27T16:08:28", "event": "train_log", "step": 4498, "epoch": 1.8978902953586498, "progress_pct": 31.63, "epoch_pct": 31.63, "eta": "47:32:11", "max_grad_norm": 0.8, "loss": 0.642429769039154, "grad_norm": 0.936612069606781, "learning_rate": 8.459600627496869e-05} +{"ts": "2025-12-27T16:08:42", "event": "train_log", "step": 4500, "epoch": 1.8987341772151898, "progress_pct": 31.65, "epoch_pct": 31.65, "eta": "47:30:48", "max_grad_norm": 0.8, "loss": 0.6341768503189087, "grad_norm": 1.1905454397201538, "learning_rate": 8.457866435231391e-05} +{"ts": "2025-12-27T16:17:15", "event": "train_log", "step": 4500, "epoch": 1.8987341772151898, "progress_pct": 31.65, "epoch_pct": 31.65, "eta": "47:49:17", "max_grad_norm": 0.8, "eval_loss": 0.6938078999519348, "eval_runtime": 513.615, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102} +{"ts": "2025-12-27T16:17:29", "event": "train_log", "step": 4502, "epoch": 1.89957805907173, "progress_pct": 31.66, "epoch_pct": 31.66, "eta": "47:47:55", "max_grad_norm": 0.8, "loss": 0.5973100662231445, "grad_norm": 0.9778118133544922, "learning_rate": 8.456131445302538e-05} +{"ts": "2025-12-27T16:17:42", "event": "train_log", "step": 4504, "epoch": 1.90042194092827, "progress_pct": 31.67, "epoch_pct": 31.67, "eta": "47:46:31", "max_grad_norm": 0.8, "loss": 0.5982911586761475, "grad_norm": 0.9587083458900452, "learning_rate": 8.454395658110536e-05} +{"ts": "2025-12-27T16:17:54", "event": "train_log", "step": 4506, "epoch": 1.90126582278481, "progress_pct": 31.69, "epoch_pct": 31.69, "eta": "47:45:05", "max_grad_norm": 0.8, "loss": 0.6858586668968201, "grad_norm": 1.327643871307373, "learning_rate": 8.452659074055798e-05} +{"ts": "2025-12-27T16:18:07", "event": "train_log", "step": 4508, "epoch": 1.9021097046413502, "progress_pct": 31.7, "epoch_pct": 31.7, "eta": "47:43:42", "max_grad_norm": 0.8, "loss": 0.6172328591346741, "grad_norm": 1.0740257501602173, "learning_rate": 8.450921693538922e-05} +{"ts": "2025-12-27T16:18:20", "event": "train_log", "step": 4510, "epoch": 1.9029535864978904, "progress_pct": 31.72, "epoch_pct": 31.72, "eta": "47:42:18", "max_grad_norm": 0.8, "loss": 0.5349634289741516, "grad_norm": 1.0705101490020752, "learning_rate": 8.449183516960685e-05} +{"ts": "2025-12-27T16:18:33", "event": "train_log", "step": 4512, "epoch": 1.9037974683544303, "progress_pct": 31.73, "epoch_pct": 31.73, "eta": "47:40:56", "max_grad_norm": 0.8, "loss": 0.5769277811050415, "grad_norm": 0.9151237607002258, "learning_rate": 8.447444544722058e-05} +{"ts": "2025-12-27T16:18:46", "event": "train_log", "step": 4514, "epoch": 1.9046413502109705, "progress_pct": 31.74, "epoch_pct": 31.74, "eta": "47:39:31", "max_grad_norm": 0.8, "loss": 0.6579093933105469, "grad_norm": 1.139900803565979, "learning_rate": 8.44570477722418e-05} +{"ts": "2025-12-27T16:18:58", "event": "train_log", "step": 4516, "epoch": 1.9054852320675106, "progress_pct": 31.76, "epoch_pct": 31.76, "eta": "47:38:06", "max_grad_norm": 0.8, "loss": 0.6748929619789124, "grad_norm": 1.2481658458709717, "learning_rate": 8.443964214868387e-05} +{"ts": "2025-12-27T16:19:10", "event": "train_log", "step": 4518, "epoch": 1.9063291139240506, "progress_pct": 31.77, "epoch_pct": 31.77, "eta": "47:36:41", "max_grad_norm": 0.8, "loss": 0.6492021083831787, "grad_norm": 1.1661686897277832, "learning_rate": 8.442222858056193e-05} +{"ts": "2025-12-27T16:19:23", "event": "train_log", "step": 4520, "epoch": 1.9071729957805907, "progress_pct": 31.79, "epoch_pct": 31.79, "eta": "47:35:18", "max_grad_norm": 0.8, "loss": 0.635409951210022, "grad_norm": 1.241477370262146, "learning_rate": 8.440480707189295e-05} +{"ts": "2025-12-27T16:19:36", "event": "train_log", "step": 4522, "epoch": 1.908016877637131, "progress_pct": 31.8, "epoch_pct": 31.8, "eta": "47:33:53", "max_grad_norm": 0.8, "loss": 0.631928026676178, "grad_norm": 1.1102054119110107, "learning_rate": 8.438737762669573e-05} +{"ts": "2025-12-27T16:19:48", "event": "train_log", "step": 4524, "epoch": 1.9088607594936708, "progress_pct": 31.81, "epoch_pct": 31.81, "eta": "47:32:29", "max_grad_norm": 0.8, "loss": 0.604518473148346, "grad_norm": 1.0638107061386108, "learning_rate": 8.43699402489909e-05} +{"ts": "2025-12-27T16:20:01", "event": "train_log", "step": 4526, "epoch": 1.909704641350211, "progress_pct": 31.83, "epoch_pct": 31.83, "eta": "47:31:06", "max_grad_norm": 0.8, "loss": 0.61314457654953, "grad_norm": 1.0270655155181885, "learning_rate": 8.435249494280096e-05} +{"ts": "2025-12-27T16:20:14", "event": "train_log", "step": 4528, "epoch": 1.9105485232067512, "progress_pct": 31.84, "epoch_pct": 31.84, "eta": "47:29:43", "max_grad_norm": 0.8, "loss": 0.661663293838501, "grad_norm": 1.1840111017227173, "learning_rate": 8.433504171215018e-05} +{"ts": "2025-12-27T16:20:26", "event": "train_log", "step": 4530, "epoch": 1.9113924050632911, "progress_pct": 31.86, "epoch_pct": 31.86, "eta": "47:28:18", "max_grad_norm": 0.8, "loss": 0.7026967406272888, "grad_norm": 1.1404399871826172, "learning_rate": 8.43175805610647e-05} +{"ts": "2025-12-27T16:20:38", "event": "train_log", "step": 4532, "epoch": 1.9122362869198313, "progress_pct": 31.87, "epoch_pct": 31.87, "eta": "47:26:53", "max_grad_norm": 0.8, "loss": 0.6599440574645996, "grad_norm": 1.2371265888214111, "learning_rate": 8.430011149357246e-05} +{"ts": "2025-12-27T16:20:52", "event": "train_log", "step": 4534, "epoch": 1.9130801687763714, "progress_pct": 31.88, "epoch_pct": 31.88, "eta": "47:25:31", "max_grad_norm": 0.8, "loss": 0.5728344321250916, "grad_norm": 1.0042651891708374, "learning_rate": 8.428263451370326e-05} +{"ts": "2025-12-27T16:21:04", "event": "train_log", "step": 4536, "epoch": 1.9139240506329114, "progress_pct": 31.9, "epoch_pct": 31.9, "eta": "47:24:08", "max_grad_norm": 0.8, "loss": 0.6495450735092163, "grad_norm": 1.04367196559906, "learning_rate": 8.426514962548866e-05} +{"ts": "2025-12-27T16:21:17", "event": "train_log", "step": 4538, "epoch": 1.9147679324894513, "progress_pct": 31.91, "epoch_pct": 31.91, "eta": "47:22:45", "max_grad_norm": 0.8, "loss": 0.6406553387641907, "grad_norm": 1.0867135524749756, "learning_rate": 8.424765683296215e-05} +{"ts": "2025-12-27T16:21:30", "event": "train_log", "step": 4540, "epoch": 1.9156118143459917, "progress_pct": 31.93, "epoch_pct": 31.93, "eta": "47:21:21", "max_grad_norm": 0.8, "loss": 0.6692186594009399, "grad_norm": 1.0751310586929321, "learning_rate": 8.423015614015892e-05} +{"ts": "2025-12-27T16:21:43", "event": "train_log", "step": 4542, "epoch": 1.9164556962025316, "progress_pct": 31.94, "epoch_pct": 31.94, "eta": "47:19:58", "max_grad_norm": 0.8, "loss": 0.6029785871505737, "grad_norm": 1.13556969165802, "learning_rate": 8.421264755111607e-05} +{"ts": "2025-12-27T16:21:55", "event": "train_log", "step": 4544, "epoch": 1.9172995780590716, "progress_pct": 31.95, "epoch_pct": 31.95, "eta": "47:18:34", "max_grad_norm": 0.8, "loss": 0.6457844972610474, "grad_norm": 1.1560977697372437, "learning_rate": 8.419513106987251e-05} +{"ts": "2025-12-27T16:22:08", "event": "train_log", "step": 4546, "epoch": 1.918143459915612, "progress_pct": 31.97, "epoch_pct": 31.97, "eta": "47:17:11", "max_grad_norm": 0.8, "loss": 0.7082147598266602, "grad_norm": 1.2192902565002441, "learning_rate": 8.417760670046893e-05} +{"ts": "2025-12-27T16:22:20", "event": "train_log", "step": 4548, "epoch": 1.918987341772152, "progress_pct": 31.98, "epoch_pct": 31.98, "eta": "47:15:47", "max_grad_norm": 0.8, "loss": 0.6919234991073608, "grad_norm": 1.1170696020126343, "learning_rate": 8.41600744469479e-05} +{"ts": "2025-12-27T16:22:33", "event": "train_log", "step": 4550, "epoch": 1.9198312236286919, "progress_pct": 32.0, "epoch_pct": 32.0, "eta": "47:14:25", "max_grad_norm": 0.8, "loss": 0.6310052871704102, "grad_norm": 1.061253547668457, "learning_rate": 8.414253431335373e-05} +{"ts": "2025-12-27T16:22:45", "event": "train_log", "step": 4552, "epoch": 1.920675105485232, "progress_pct": 32.01, "epoch_pct": 32.01, "eta": "47:13:02", "max_grad_norm": 0.8, "loss": 0.6330236792564392, "grad_norm": 1.0671885013580322, "learning_rate": 8.412498630373263e-05} +{"ts": "2025-12-27T16:22:58", "event": "train_log", "step": 4554, "epoch": 1.9215189873417722, "progress_pct": 32.03, "epoch_pct": 32.03, "eta": "47:11:38", "max_grad_norm": 0.8, "loss": 0.7031015157699585, "grad_norm": 1.2085163593292236, "learning_rate": 8.410743042213256e-05} +{"ts": "2025-12-27T16:23:10", "event": "train_log", "step": 4556, "epoch": 1.9223628691983121, "progress_pct": 32.04, "epoch_pct": 32.04, "eta": "47:10:14", "max_grad_norm": 0.8, "loss": 0.7078304290771484, "grad_norm": 1.2682013511657715, "learning_rate": 8.408986667260334e-05} +{"ts": "2025-12-27T16:23:21", "event": "train_log", "step": 4558, "epoch": 1.9232067510548523, "progress_pct": 32.05, "epoch_pct": 32.05, "eta": "47:08:49", "max_grad_norm": 0.8, "loss": 0.6542860865592957, "grad_norm": 1.2966876029968262, "learning_rate": 8.407229505919658e-05} +{"ts": "2025-12-27T16:23:34", "event": "train_log", "step": 4560, "epoch": 1.9240506329113924, "progress_pct": 32.07, "epoch_pct": 32.07, "eta": "47:07:26", "max_grad_norm": 0.8, "loss": 0.5856828093528748, "grad_norm": 1.1086169481277466, "learning_rate": 8.405471558596573e-05} +{"ts": "2025-12-27T16:23:46", "event": "train_log", "step": 4562, "epoch": 1.9248945147679324, "progress_pct": 32.08, "epoch_pct": 32.08, "eta": "47:06:01", "max_grad_norm": 0.8, "loss": 0.7382104992866516, "grad_norm": 1.3175504207611084, "learning_rate": 8.403712825696604e-05} +{"ts": "2025-12-27T16:23:58", "event": "train_log", "step": 4564, "epoch": 1.9257383966244725, "progress_pct": 32.1, "epoch_pct": 32.1, "eta": "47:04:37", "max_grad_norm": 0.8, "loss": 0.6862360239028931, "grad_norm": 1.163164496421814, "learning_rate": 8.401953307625454e-05} +{"ts": "2025-12-27T16:24:10", "event": "train_log", "step": 4566, "epoch": 1.9265822784810127, "progress_pct": 32.11, "epoch_pct": 32.11, "eta": "47:03:14", "max_grad_norm": 0.8, "loss": 0.7442302703857422, "grad_norm": 1.207650899887085, "learning_rate": 8.400193004789013e-05} +{"ts": "2025-12-27T16:24:22", "event": "train_log", "step": 4568, "epoch": 1.9274261603375527, "progress_pct": 32.12, "epoch_pct": 32.12, "eta": "47:01:50", "max_grad_norm": 0.8, "loss": 0.595226526260376, "grad_norm": 1.1570589542388916, "learning_rate": 8.398431917593345e-05} +{"ts": "2025-12-27T16:24:35", "event": "train_log", "step": 4570, "epoch": 1.9282700421940928, "progress_pct": 32.14, "epoch_pct": 32.14, "eta": "47:00:27", "max_grad_norm": 0.8, "loss": 0.6360410451889038, "grad_norm": 1.091927170753479, "learning_rate": 8.396670046444704e-05} +{"ts": "2025-12-27T16:24:47", "event": "train_log", "step": 4572, "epoch": 1.929113924050633, "progress_pct": 32.15, "epoch_pct": 32.15, "eta": "46:59:04", "max_grad_norm": 0.8, "loss": 0.6343122124671936, "grad_norm": 1.149559497833252, "learning_rate": 8.394907391749516e-05} +{"ts": "2025-12-27T16:24:59", "event": "train_log", "step": 4574, "epoch": 1.929957805907173, "progress_pct": 32.17, "epoch_pct": 32.17, "eta": "46:57:42", "max_grad_norm": 0.8, "loss": 0.7394745349884033, "grad_norm": 1.0585254430770874, "learning_rate": 8.393143953914395e-05} +{"ts": "2025-12-27T16:25:12", "event": "train_log", "step": 4576, "epoch": 1.930801687763713, "progress_pct": 32.18, "epoch_pct": 32.18, "eta": "46:56:19", "max_grad_norm": 0.8, "loss": 0.6489678025245667, "grad_norm": 1.1648521423339844, "learning_rate": 8.391379733346128e-05} +{"ts": "2025-12-27T16:25:24", "event": "train_log", "step": 4578, "epoch": 1.9316455696202532, "progress_pct": 32.19, "epoch_pct": 32.19, "eta": "46:54:56", "max_grad_norm": 0.8, "loss": 0.6687861084938049, "grad_norm": 1.1756316423416138, "learning_rate": 8.389614730451692e-05} +{"ts": "2025-12-27T16:25:36", "event": "train_log", "step": 4580, "epoch": 1.9324894514767932, "progress_pct": 32.21, "epoch_pct": 32.21, "eta": "46:53:33", "max_grad_norm": 0.8, "loss": 0.523727536201477, "grad_norm": 0.9857237339019775, "learning_rate": 8.387848945638235e-05} +{"ts": "2025-12-27T16:25:49", "event": "train_log", "step": 4582, "epoch": 1.9333333333333333, "progress_pct": 32.22, "epoch_pct": 32.22, "eta": "46:52:12", "max_grad_norm": 0.8, "loss": 0.6545047760009766, "grad_norm": 1.1038693189620972, "learning_rate": 8.386082379313092e-05} +{"ts": "2025-12-27T16:26:02", "event": "train_log", "step": 4584, "epoch": 1.9341772151898735, "progress_pct": 32.24, "epoch_pct": 32.24, "eta": "46:50:50", "max_grad_norm": 0.8, "loss": 0.6067036390304565, "grad_norm": 1.0780832767486572, "learning_rate": 8.384315031883774e-05} +{"ts": "2025-12-27T16:26:15", "event": "train_log", "step": 4586, "epoch": 1.9350210970464135, "progress_pct": 32.25, "epoch_pct": 32.25, "eta": "46:49:28", "max_grad_norm": 0.8, "loss": 0.6880824565887451, "grad_norm": 1.2915070056915283, "learning_rate": 8.382546903757975e-05} +{"ts": "2025-12-27T16:26:28", "event": "train_log", "step": 4588, "epoch": 1.9358649789029536, "progress_pct": 32.26, "epoch_pct": 32.26, "eta": "46:48:07", "max_grad_norm": 0.8, "loss": 0.7319117188453674, "grad_norm": 1.1243441104888916, "learning_rate": 8.380777995343568e-05} +{"ts": "2025-12-27T16:26:40", "event": "train_log", "step": 4590, "epoch": 1.9367088607594938, "progress_pct": 32.28, "epoch_pct": 32.28, "eta": "46:46:45", "max_grad_norm": 0.8, "loss": 0.6845395565032959, "grad_norm": 1.1143072843551636, "learning_rate": 8.379008307048609e-05} +{"ts": "2025-12-27T16:26:53", "event": "train_log", "step": 4592, "epoch": 1.9375527426160337, "progress_pct": 32.29, "epoch_pct": 32.29, "eta": "46:45:24", "max_grad_norm": 0.8, "loss": 0.6653600335121155, "grad_norm": 1.039494276046753, "learning_rate": 8.377237839281327e-05} +{"ts": "2025-12-27T16:27:06", "event": "train_log", "step": 4594, "epoch": 1.9383966244725739, "progress_pct": 32.31, "epoch_pct": 32.31, "eta": "46:44:02", "max_grad_norm": 0.8, "loss": 0.6352495551109314, "grad_norm": 1.299617886543274, "learning_rate": 8.375466592450136e-05} +{"ts": "2025-12-27T16:27:19", "event": "train_log", "step": 4596, "epoch": 1.939240506329114, "progress_pct": 32.32, "epoch_pct": 32.32, "eta": "46:42:41", "max_grad_norm": 0.8, "loss": 0.5660957098007202, "grad_norm": 0.9918657541275024, "learning_rate": 8.373694566963631e-05} +{"ts": "2025-12-27T16:27:32", "event": "train_log", "step": 4598, "epoch": 1.940084388185654, "progress_pct": 32.33, "epoch_pct": 32.33, "eta": "46:41:20", "max_grad_norm": 0.8, "loss": 0.6296496987342834, "grad_norm": 1.0540478229522705, "learning_rate": 8.371921763230579e-05} +{"ts": "2025-12-27T16:27:44", "event": "train_log", "step": 4600, "epoch": 1.9409282700421941, "progress_pct": 32.35, "epoch_pct": 32.35, "eta": "46:39:58", "max_grad_norm": 0.8, "loss": 0.6672025918960571, "grad_norm": 1.1309545040130615, "learning_rate": 8.370148181659939e-05} +{"ts": "2025-12-27T16:38:02", "event": "train_log", "step": 4600, "epoch": 1.9409282700421941, "progress_pct": 32.35, "epoch_pct": 32.35, "eta": "47:01:30", "max_grad_norm": 0.8, "eval_loss": 0.6930755376815796, "eval_runtime": 617.8927, "eval_samples_per_second": 3.41, "eval_steps_per_second": 3.41} +{"ts": "2025-12-27T16:38:21", "event": "train_log", "step": 4602, "epoch": 1.9417721518987343, "progress_pct": 32.36, "epoch_pct": 32.36, "eta": "47:00:21", "max_grad_norm": 0.8, "loss": 0.6200884580612183, "grad_norm": 1.2338588237762451, "learning_rate": 8.368373822660836e-05} +{"ts": "2025-12-27T16:38:41", "event": "train_log", "step": 4604, "epoch": 1.9426160337552743, "progress_pct": 32.38, "epoch_pct": 32.38, "eta": "46:59:13", "max_grad_norm": 0.8, "loss": 0.653294026851654, "grad_norm": 1.1756945848464966, "learning_rate": 8.366598686642582e-05} +{"ts": "2025-12-27T16:39:00", "event": "train_log", "step": 4606, "epoch": 1.9434599156118142, "progress_pct": 32.39, "epoch_pct": 32.39, "eta": "46:58:05", "max_grad_norm": 0.8, "loss": 0.5670395493507385, "grad_norm": 1.032018780708313, "learning_rate": 8.364822774014671e-05} +{"ts": "2025-12-27T16:39:19", "event": "train_log", "step": 4608, "epoch": 1.9443037974683546, "progress_pct": 32.41, "epoch_pct": 32.41, "eta": "46:56:55", "max_grad_norm": 0.8, "loss": 0.6819197535514832, "grad_norm": 1.045280933380127, "learning_rate": 8.363046085186766e-05} +{"ts": "2025-12-27T16:39:36", "event": "train_log", "step": 4610, "epoch": 1.9451476793248945, "progress_pct": 32.42, "epoch_pct": 32.42, "eta": "46:55:43", "max_grad_norm": 0.8, "loss": 0.6952820420265198, "grad_norm": 1.3223930597305298, "learning_rate": 8.36126862056872e-05} +{"ts": "2025-12-27T16:39:56", "event": "train_log", "step": 4612, "epoch": 1.9459915611814345, "progress_pct": 32.43, "epoch_pct": 32.43, "eta": "46:54:37", "max_grad_norm": 0.8, "loss": 0.5291440486907959, "grad_norm": 1.0048432350158691, "learning_rate": 8.359490380570556e-05} +{"ts": "2025-12-27T16:40:15", "event": "train_log", "step": 4614, "epoch": 1.9468354430379748, "progress_pct": 32.45, "epoch_pct": 32.45, "eta": "46:53:27", "max_grad_norm": 0.8, "loss": 0.6857813000679016, "grad_norm": 1.1477346420288086, "learning_rate": 8.357711365602483e-05} +{"ts": "2025-12-27T16:40:34", "event": "train_log", "step": 4616, "epoch": 1.9476793248945148, "progress_pct": 32.46, "epoch_pct": 32.46, "eta": "46:52:19", "max_grad_norm": 0.8, "loss": 0.5581508278846741, "grad_norm": 0.959985077381134, "learning_rate": 8.355931576074882e-05} +{"ts": "2025-12-27T16:40:53", "event": "train_log", "step": 4618, "epoch": 1.9485232067510547, "progress_pct": 32.48, "epoch_pct": 32.48, "eta": "46:51:11", "max_grad_norm": 0.8, "loss": 0.6536211371421814, "grad_norm": 1.1104289293289185, "learning_rate": 8.35415101239832e-05} +{"ts": "2025-12-27T16:41:11", "event": "train_log", "step": 4620, "epoch": 1.9493670886075949, "progress_pct": 32.49, "epoch_pct": 32.49, "eta": "46:49:59", "max_grad_norm": 0.8, "loss": 0.6570560336112976, "grad_norm": 1.2344517707824707, "learning_rate": 8.352369674983535e-05} +{"ts": "2025-12-27T16:41:29", "event": "train_log", "step": 4622, "epoch": 1.950210970464135, "progress_pct": 32.5, "epoch_pct": 32.5, "eta": "46:48:49", "max_grad_norm": 0.8, "loss": 0.6070495247840881, "grad_norm": 1.3411606550216675, "learning_rate": 8.350587564241451e-05} +{"ts": "2025-12-27T16:41:47", "event": "train_log", "step": 4624, "epoch": 1.951054852320675, "progress_pct": 32.52, "epoch_pct": 32.52, "eta": "46:47:38", "max_grad_norm": 0.8, "loss": 0.6444135904312134, "grad_norm": 1.1713159084320068, "learning_rate": 8.348804680583166e-05} +{"ts": "2025-12-27T16:42:00", "event": "train_log", "step": 4626, "epoch": 1.9518987341772152, "progress_pct": 32.53, "epoch_pct": 32.53, "eta": "46:46:18", "max_grad_norm": 0.8, "loss": 0.6517419815063477, "grad_norm": 1.127242922782898, "learning_rate": 8.347021024419954e-05} +{"ts": "2025-12-27T16:42:14", "event": "train_log", "step": 4628, "epoch": 1.9527426160337553, "progress_pct": 32.55, "epoch_pct": 32.55, "eta": "46:44:57", "max_grad_norm": 0.8, "loss": 0.6174065470695496, "grad_norm": 1.0733028650283813, "learning_rate": 8.345236596163274e-05} +{"ts": "2025-12-27T16:42:25", "event": "train_log", "step": 4630, "epoch": 1.9535864978902953, "progress_pct": 32.56, "epoch_pct": 32.56, "eta": "46:43:34", "max_grad_norm": 0.8, "loss": 0.7163593769073486, "grad_norm": 1.1114680767059326, "learning_rate": 8.343451396224757e-05} +{"ts": "2025-12-27T16:42:38", "event": "train_log", "step": 4632, "epoch": 1.9544303797468354, "progress_pct": 32.57, "epoch_pct": 32.57, "eta": "46:42:12", "max_grad_norm": 0.8, "loss": 0.698553204536438, "grad_norm": 1.0839568376541138, "learning_rate": 8.341665425016216e-05} +{"ts": "2025-12-27T16:42:50", "event": "train_log", "step": 4634, "epoch": 1.9552742616033756, "progress_pct": 32.59, "epoch_pct": 32.59, "eta": "46:40:50", "max_grad_norm": 0.8, "loss": 0.6224857568740845, "grad_norm": 1.17001211643219, "learning_rate": 8.339878682949638e-05} +{"ts": "2025-12-27T16:43:03", "event": "train_log", "step": 4636, "epoch": 1.9561181434599155, "progress_pct": 32.6, "epoch_pct": 32.6, "eta": "46:39:29", "max_grad_norm": 0.8, "loss": 0.5931200981140137, "grad_norm": 3.483793020248413, "learning_rate": 8.338091170437193e-05} +{"ts": "2025-12-27T16:43:16", "event": "train_log", "step": 4638, "epoch": 1.9569620253164557, "progress_pct": 32.62, "epoch_pct": 32.62, "eta": "46:38:08", "max_grad_norm": 0.8, "loss": 0.6031442284584045, "grad_norm": 1.1575394868850708, "learning_rate": 8.336302887891224e-05} +{"ts": "2025-12-27T16:43:29", "event": "train_log", "step": 4640, "epoch": 1.9578059071729959, "progress_pct": 32.63, "epoch_pct": 32.63, "eta": "46:36:47", "max_grad_norm": 0.8, "loss": 0.6101768016815186, "grad_norm": 1.1494992971420288, "learning_rate": 8.334513835724252e-05} +{"ts": "2025-12-27T16:43:40", "event": "train_log", "step": 4642, "epoch": 1.9586497890295358, "progress_pct": 32.64, "epoch_pct": 32.64, "eta": "46:35:24", "max_grad_norm": 0.8, "loss": 0.6571711301803589, "grad_norm": 1.3858197927474976, "learning_rate": 8.332724014348981e-05} +{"ts": "2025-12-27T16:43:53", "event": "train_log", "step": 4644, "epoch": 1.959493670886076, "progress_pct": 32.66, "epoch_pct": 32.66, "eta": "46:34:02", "max_grad_norm": 0.8, "loss": 0.6391071677207947, "grad_norm": 1.1094943284988403, "learning_rate": 8.330933424178284e-05} +{"ts": "2025-12-27T16:44:06", "event": "train_log", "step": 4646, "epoch": 1.9603375527426161, "progress_pct": 32.67, "epoch_pct": 32.67, "eta": "46:32:41", "max_grad_norm": 0.8, "loss": 0.6542805433273315, "grad_norm": 1.1640198230743408, "learning_rate": 8.329142065625218e-05} +{"ts": "2025-12-27T16:44:18", "event": "train_log", "step": 4648, "epoch": 1.961181434599156, "progress_pct": 32.69, "epoch_pct": 32.69, "eta": "46:31:21", "max_grad_norm": 0.8, "loss": 0.6053075194358826, "grad_norm": 1.1080211400985718, "learning_rate": 8.327349939103016e-05} +{"ts": "2025-12-27T16:44:31", "event": "train_log", "step": 4650, "epoch": 1.9620253164556962, "progress_pct": 32.7, "epoch_pct": 32.7, "eta": "46:30:00", "max_grad_norm": 0.8, "loss": 0.6009573340415955, "grad_norm": 1.0137052536010742, "learning_rate": 8.325557045025085e-05} +{"ts": "2025-12-27T16:44:44", "event": "train_log", "step": 4652, "epoch": 1.9628691983122364, "progress_pct": 32.71, "epoch_pct": 32.71, "eta": "46:28:38", "max_grad_norm": 0.8, "loss": 0.5993483066558838, "grad_norm": 1.0867283344268799, "learning_rate": 8.323763383805012e-05} +{"ts": "2025-12-27T16:44:56", "event": "train_log", "step": 4654, "epoch": 1.9637130801687763, "progress_pct": 32.73, "epoch_pct": 32.73, "eta": "46:27:17", "max_grad_norm": 0.8, "loss": 0.6788463592529297, "grad_norm": 1.0577161312103271, "learning_rate": 8.321968955856562e-05} +{"ts": "2025-12-27T16:45:09", "event": "train_log", "step": 4656, "epoch": 1.9645569620253165, "progress_pct": 32.74, "epoch_pct": 32.74, "eta": "46:25:56", "max_grad_norm": 0.8, "loss": 0.5786917209625244, "grad_norm": 1.2002183198928833, "learning_rate": 8.320173761593672e-05} +{"ts": "2025-12-27T16:45:21", "event": "train_log", "step": 4658, "epoch": 1.9654008438818567, "progress_pct": 32.76, "epoch_pct": 32.76, "eta": "46:24:35", "max_grad_norm": 0.8, "loss": 0.7437994480133057, "grad_norm": 1.2266993522644043, "learning_rate": 8.318377801430461e-05} +{"ts": "2025-12-27T16:45:34", "event": "train_log", "step": 4660, "epoch": 1.9662447257383966, "progress_pct": 32.77, "epoch_pct": 32.77, "eta": "46:23:15", "max_grad_norm": 0.8, "loss": 0.6763550639152527, "grad_norm": 1.007582187652588, "learning_rate": 8.316581075781223e-05} +{"ts": "2025-12-27T16:45:46", "event": "train_log", "step": 4662, "epoch": 1.9670886075949368, "progress_pct": 32.78, "epoch_pct": 32.78, "eta": "46:21:53", "max_grad_norm": 0.8, "loss": 0.6953140497207642, "grad_norm": 1.2374811172485352, "learning_rate": 8.314783585060425e-05} +{"ts": "2025-12-27T16:45:59", "event": "train_log", "step": 4664, "epoch": 1.967932489451477, "progress_pct": 32.8, "epoch_pct": 32.8, "eta": "46:20:32", "max_grad_norm": 0.8, "loss": 0.6867341995239258, "grad_norm": 1.1791057586669922, "learning_rate": 8.312985329682717e-05} +{"ts": "2025-12-27T16:46:11", "event": "train_log", "step": 4666, "epoch": 1.9687763713080169, "progress_pct": 32.81, "epoch_pct": 32.81, "eta": "46:19:12", "max_grad_norm": 0.8, "loss": 0.6445001363754272, "grad_norm": 1.1903331279754639, "learning_rate": 8.31118631006292e-05} +{"ts": "2025-12-27T16:46:24", "event": "train_log", "step": 4668, "epoch": 1.9696202531645568, "progress_pct": 32.83, "epoch_pct": 32.83, "eta": "46:17:51", "max_grad_norm": 0.8, "loss": 0.6500589847564697, "grad_norm": 1.1731067895889282, "learning_rate": 8.309386526616034e-05} +{"ts": "2025-12-27T16:46:36", "event": "train_log", "step": 4670, "epoch": 1.9704641350210972, "progress_pct": 32.84, "epoch_pct": 32.84, "eta": "46:16:30", "max_grad_norm": 0.8, "loss": 0.6215718984603882, "grad_norm": 0.9470233917236328, "learning_rate": 8.307585979757233e-05} +{"ts": "2025-12-27T16:46:49", "event": "train_log", "step": 4672, "epoch": 1.9713080168776371, "progress_pct": 32.86, "epoch_pct": 32.86, "eta": "46:15:10", "max_grad_norm": 0.8, "loss": 0.6396787762641907, "grad_norm": 1.2900800704956055, "learning_rate": 8.305784669901872e-05} +{"ts": "2025-12-27T16:47:01", "event": "train_log", "step": 4674, "epoch": 1.972151898734177, "progress_pct": 32.87, "epoch_pct": 32.87, "eta": "46:13:49", "max_grad_norm": 0.8, "loss": 0.6581959128379822, "grad_norm": 1.1729133129119873, "learning_rate": 8.303982597465474e-05} +{"ts": "2025-12-27T16:47:14", "event": "train_log", "step": 4676, "epoch": 1.9729957805907175, "progress_pct": 32.88, "epoch_pct": 32.88, "eta": "46:12:29", "max_grad_norm": 0.8, "loss": 0.7013490796089172, "grad_norm": 1.1450555324554443, "learning_rate": 8.302179762863746e-05} +{"ts": "2025-12-27T16:47:26", "event": "train_log", "step": 4678, "epoch": 1.9738396624472574, "progress_pct": 32.9, "epoch_pct": 32.9, "eta": "46:11:08", "max_grad_norm": 0.8, "loss": 0.6796102523803711, "grad_norm": 1.1506338119506836, "learning_rate": 8.300376166512567e-05} +{"ts": "2025-12-27T16:47:38", "event": "train_log", "step": 4680, "epoch": 1.9746835443037973, "progress_pct": 32.91, "epoch_pct": 32.91, "eta": "46:09:46", "max_grad_norm": 0.8, "loss": 0.6960519552230835, "grad_norm": 1.149979591369629, "learning_rate": 8.298571808827991e-05} +{"ts": "2025-12-27T16:47:50", "event": "train_log", "step": 4682, "epoch": 1.9755274261603377, "progress_pct": 32.93, "epoch_pct": 32.93, "eta": "46:08:25", "max_grad_norm": 0.8, "loss": 0.6789507865905762, "grad_norm": 1.1078912019729614, "learning_rate": 8.296766690226249e-05} +{"ts": "2025-12-27T16:48:03", "event": "train_log", "step": 4684, "epoch": 1.9763713080168777, "progress_pct": 32.94, "epoch_pct": 32.94, "eta": "46:07:05", "max_grad_norm": 0.8, "loss": 0.5962659120559692, "grad_norm": 1.0199202299118042, "learning_rate": 8.294960811123747e-05} +{"ts": "2025-12-27T16:48:15", "event": "train_log", "step": 4686, "epoch": 1.9772151898734176, "progress_pct": 32.95, "epoch_pct": 32.95, "eta": "46:05:44", "max_grad_norm": 0.8, "loss": 0.6483094692230225, "grad_norm": 1.2226134538650513, "learning_rate": 8.293154171937068e-05} +{"ts": "2025-12-27T16:48:28", "event": "train_log", "step": 4688, "epoch": 1.9780590717299578, "progress_pct": 32.97, "epoch_pct": 32.97, "eta": "46:04:24", "max_grad_norm": 0.8, "loss": 0.6750242710113525, "grad_norm": 1.184095025062561, "learning_rate": 8.291346773082965e-05} +{"ts": "2025-12-27T16:48:40", "event": "train_log", "step": 4690, "epoch": 1.978902953586498, "progress_pct": 32.98, "epoch_pct": 32.98, "eta": "46:03:03", "max_grad_norm": 0.8, "loss": 0.7094066739082336, "grad_norm": 1.1018693447113037, "learning_rate": 8.289538614978375e-05} +{"ts": "2025-12-27T16:48:51", "event": "train_log", "step": 4692, "epoch": 1.9797468354430379, "progress_pct": 33.0, "epoch_pct": 33.0, "eta": "46:01:42", "max_grad_norm": 0.8, "loss": 0.6554126739501953, "grad_norm": 1.0342390537261963, "learning_rate": 8.287729698040403e-05} +{"ts": "2025-12-27T16:49:04", "event": "train_log", "step": 4694, "epoch": 1.980590717299578, "progress_pct": 33.01, "epoch_pct": 33.01, "eta": "46:00:22", "max_grad_norm": 0.8, "loss": 0.5493529438972473, "grad_norm": 1.0603563785552979, "learning_rate": 8.285920022686332e-05} +{"ts": "2025-12-27T16:49:17", "event": "train_log", "step": 4696, "epoch": 1.9814345991561182, "progress_pct": 33.02, "epoch_pct": 33.02, "eta": "45:59:03", "max_grad_norm": 0.8, "loss": 0.6824741363525391, "grad_norm": 1.139609932899475, "learning_rate": 8.284109589333617e-05} +{"ts": "2025-12-27T16:49:29", "event": "train_log", "step": 4698, "epoch": 1.9822784810126581, "progress_pct": 33.04, "epoch_pct": 33.04, "eta": "45:57:41", "max_grad_norm": 0.8, "loss": 0.7121000289916992, "grad_norm": 1.2167822122573853, "learning_rate": 8.282298398399895e-05} +{"ts": "2025-12-27T16:49:41", "event": "train_log", "step": 4700, "epoch": 1.9831223628691983, "progress_pct": 33.05, "epoch_pct": 33.05, "eta": "45:56:22", "max_grad_norm": 0.8, "loss": 0.6711249351501465, "grad_norm": 1.109857201576233, "learning_rate": 8.280486450302968e-05} +{"ts": "2025-12-27T16:58:16", "event": "train_log", "step": 4700, "epoch": 1.9831223628691983, "progress_pct": 33.05, "epoch_pct": 33.05, "eta": "46:13:45", "max_grad_norm": 0.8, "eval_loss": 0.6923081278800964, "eval_runtime": 514.7729, "eval_samples_per_second": 4.093, "eval_steps_per_second": 4.093} +{"ts": "2025-12-27T16:58:29", "event": "train_log", "step": 4702, "epoch": 1.9839662447257385, "progress_pct": 33.07, "epoch_pct": 33.07, "eta": "46:12:25", "max_grad_norm": 0.8, "loss": 0.581635594367981, "grad_norm": 1.1387107372283936, "learning_rate": 8.27867374546082e-05} +{"ts": "2025-12-27T16:58:41", "event": "train_log", "step": 4704, "epoch": 1.9848101265822784, "progress_pct": 33.08, "epoch_pct": 33.08, "eta": "46:11:04", "max_grad_norm": 0.8, "loss": 0.6867302060127258, "grad_norm": 1.2519257068634033, "learning_rate": 8.27686028429161e-05} +{"ts": "2025-12-27T16:58:54", "event": "train_log", "step": 4706, "epoch": 1.9856540084388186, "progress_pct": 33.09, "epoch_pct": 33.09, "eta": "46:09:44", "max_grad_norm": 0.8, "loss": 0.6494556665420532, "grad_norm": 1.0927205085754395, "learning_rate": 8.275046067213663e-05} +{"ts": "2025-12-27T16:59:07", "event": "train_log", "step": 4708, "epoch": 1.9864978902953587, "progress_pct": 33.11, "epoch_pct": 33.11, "eta": "46:08:26", "max_grad_norm": 0.8, "loss": 0.6949493288993835, "grad_norm": 1.042035698890686, "learning_rate": 8.273231094645487e-05} +{"ts": "2025-12-27T16:59:20", "event": "train_log", "step": 4710, "epoch": 1.9873417721518987, "progress_pct": 33.12, "epoch_pct": 33.12, "eta": "46:07:06", "max_grad_norm": 0.8, "loss": 0.6535884737968445, "grad_norm": 1.0220824480056763, "learning_rate": 8.271415367005762e-05} +{"ts": "2025-12-27T16:59:32", "event": "train_log", "step": 4712, "epoch": 1.9881856540084388, "progress_pct": 33.14, "epoch_pct": 33.14, "eta": "46:05:45", "max_grad_norm": 0.8, "loss": 0.6635278463363647, "grad_norm": 1.3023611307144165, "learning_rate": 8.269598884713339e-05} +{"ts": "2025-12-27T16:59:44", "event": "train_log", "step": 4714, "epoch": 1.989029535864979, "progress_pct": 33.15, "epoch_pct": 33.15, "eta": "46:04:23", "max_grad_norm": 0.8, "loss": 0.7194697856903076, "grad_norm": 1.2526965141296387, "learning_rate": 8.267781648187248e-05} +{"ts": "2025-12-27T16:59:58", "event": "train_log", "step": 4716, "epoch": 1.989873417721519, "progress_pct": 33.16, "epoch_pct": 33.16, "eta": "46:03:06", "max_grad_norm": 0.8, "loss": 0.6355333924293518, "grad_norm": 1.0388038158416748, "learning_rate": 8.265963657846691e-05} +{"ts": "2025-12-27T17:00:10", "event": "train_log", "step": 4718, "epoch": 1.990717299578059, "progress_pct": 33.18, "epoch_pct": 33.18, "eta": "46:01:45", "max_grad_norm": 0.8, "loss": 0.6898305416107178, "grad_norm": 1.0852965116500854, "learning_rate": 8.264144914111041e-05} +{"ts": "2025-12-27T17:00:22", "event": "train_log", "step": 4720, "epoch": 1.9915611814345993, "progress_pct": 33.19, "epoch_pct": 33.19, "eta": "46:00:25", "max_grad_norm": 0.8, "loss": 0.6202836036682129, "grad_norm": 1.0714049339294434, "learning_rate": 8.262325417399847e-05} +{"ts": "2025-12-27T17:00:35", "event": "train_log", "step": 4722, "epoch": 1.9924050632911392, "progress_pct": 33.21, "epoch_pct": 33.21, "eta": "45:59:06", "max_grad_norm": 0.8, "loss": 0.6160458326339722, "grad_norm": 1.0767238140106201, "learning_rate": 8.260505168132835e-05} +{"ts": "2025-12-27T17:00:48", "event": "train_log", "step": 4724, "epoch": 1.9932489451476794, "progress_pct": 33.22, "epoch_pct": 33.22, "eta": "45:57:48", "max_grad_norm": 0.8, "loss": 0.6049920916557312, "grad_norm": 0.9605211615562439, "learning_rate": 8.258684166729899e-05} +{"ts": "2025-12-27T17:01:01", "event": "train_log", "step": 4726, "epoch": 1.9940928270042195, "progress_pct": 33.23, "epoch_pct": 33.23, "eta": "45:56:28", "max_grad_norm": 0.8, "loss": 0.5622014999389648, "grad_norm": 1.0580185651779175, "learning_rate": 8.256862413611113e-05} +{"ts": "2025-12-27T17:01:14", "event": "train_log", "step": 4728, "epoch": 1.9949367088607595, "progress_pct": 33.25, "epoch_pct": 33.25, "eta": "45:55:09", "max_grad_norm": 0.8, "loss": 0.6678924560546875, "grad_norm": 1.1039034128189087, "learning_rate": 8.255039909196713e-05} +{"ts": "2025-12-27T17:01:26", "event": "train_log", "step": 4730, "epoch": 1.9957805907172996, "progress_pct": 33.26, "epoch_pct": 33.26, "eta": "45:53:49", "max_grad_norm": 0.8, "loss": 0.658260703086853, "grad_norm": 1.1482586860656738, "learning_rate": 8.253216653907123e-05} +{"ts": "2025-12-27T17:01:39", "event": "train_log", "step": 4732, "epoch": 1.9966244725738398, "progress_pct": 33.28, "epoch_pct": 33.28, "eta": "45:52:30", "max_grad_norm": 0.8, "loss": 0.6461613178253174, "grad_norm": 1.135349988937378, "learning_rate": 8.251392648162929e-05} +{"ts": "2025-12-27T17:01:51", "event": "train_log", "step": 4734, "epoch": 1.9974683544303797, "progress_pct": 33.29, "epoch_pct": 33.29, "eta": "45:51:10", "max_grad_norm": 0.8, "loss": 0.6837426424026489, "grad_norm": 1.0155420303344727, "learning_rate": 8.249567892384895e-05} +{"ts": "2025-12-27T17:02:04", "event": "train_log", "step": 4736, "epoch": 1.9983122362869197, "progress_pct": 33.31, "epoch_pct": 33.31, "eta": "45:49:51", "max_grad_norm": 0.8, "loss": 0.6091697812080383, "grad_norm": 1.3392970561981201, "learning_rate": 8.247742386993958e-05} +{"ts": "2025-12-27T17:02:17", "event": "train_log", "step": 4738, "epoch": 1.99915611814346, "progress_pct": 33.32, "epoch_pct": 33.32, "eta": "45:48:33", "max_grad_norm": 0.8, "loss": 0.6539653539657593, "grad_norm": 1.0509974956512451, "learning_rate": 8.245916132411226e-05} +{"ts": "2025-12-27T17:02:30", "event": "train_log", "step": 4740, "epoch": 2.0, "progress_pct": 33.33, "epoch_pct": 33.33, "eta": "45:47:14", "max_grad_norm": 0.8, "loss": 0.5630147457122803, "grad_norm": 0.9777396321296692, "learning_rate": 8.244089129057982e-05} +{"ts": "2025-12-27T17:02:43", "event": "train_log", "step": 4742, "epoch": 2.00084388185654, "progress_pct": 33.35, "epoch_pct": 33.35, "eta": "45:45:55", "max_grad_norm": 0.8, "loss": 0.6190353631973267, "grad_norm": 1.1639164686203003, "learning_rate": 8.24226137735568e-05} +{"ts": "2025-12-27T17:02:55", "event": "train_log", "step": 4744, "epoch": 2.0016877637130803, "progress_pct": 33.36, "epoch_pct": 33.36, "eta": "45:44:35", "max_grad_norm": 0.8, "loss": 0.6282529234886169, "grad_norm": 1.119614839553833, "learning_rate": 8.240432877725947e-05} +{"ts": "2025-12-27T17:03:07", "event": "train_log", "step": 4746, "epoch": 2.0025316455696203, "progress_pct": 33.38, "epoch_pct": 33.38, "eta": "45:43:16", "max_grad_norm": 0.8, "loss": 0.6176725625991821, "grad_norm": 1.114739179611206, "learning_rate": 8.238603630590581e-05} +{"ts": "2025-12-27T17:03:21", "event": "train_log", "step": 4748, "epoch": 2.00337552742616, "progress_pct": 33.39, "epoch_pct": 33.39, "eta": "45:41:59", "max_grad_norm": 0.8, "loss": 0.5182007551193237, "grad_norm": 1.0543076992034912, "learning_rate": 8.236773636371557e-05} +{"ts": "2025-12-27T17:03:33", "event": "train_log", "step": 4750, "epoch": 2.0042194092827006, "progress_pct": 33.4, "epoch_pct": 33.4, "eta": "45:40:40", "max_grad_norm": 0.8, "loss": 0.532536506652832, "grad_norm": 1.060389518737793, "learning_rate": 8.234942895491019e-05} +{"ts": "2025-12-27T17:03:46", "event": "train_log", "step": 4752, "epoch": 2.0050632911392405, "progress_pct": 33.42, "epoch_pct": 33.42, "eta": "45:39:22", "max_grad_norm": 0.8, "loss": 0.5474061369895935, "grad_norm": 1.0824412107467651, "learning_rate": 8.233111408371282e-05} +{"ts": "2025-12-27T17:03:59", "event": "train_log", "step": 4754, "epoch": 2.0059071729957805, "progress_pct": 33.43, "epoch_pct": 33.43, "eta": "45:38:04", "max_grad_norm": 0.8, "loss": 0.586384654045105, "grad_norm": 1.1450858116149902, "learning_rate": 8.231279175434838e-05} +{"ts": "2025-12-27T17:04:12", "event": "train_log", "step": 4756, "epoch": 2.006751054852321, "progress_pct": 33.45, "epoch_pct": 33.45, "eta": "45:36:46", "max_grad_norm": 0.8, "loss": 0.6469444036483765, "grad_norm": 1.1225577592849731, "learning_rate": 8.229446197104345e-05} +{"ts": "2025-12-27T17:04:24", "event": "train_log", "step": 4758, "epoch": 2.007594936708861, "progress_pct": 33.46, "epoch_pct": 33.46, "eta": "45:35:27", "max_grad_norm": 0.8, "loss": 0.5371572971343994, "grad_norm": 1.7292449474334717, "learning_rate": 8.227612473802637e-05} +{"ts": "2025-12-27T17:04:37", "event": "train_log", "step": 4760, "epoch": 2.0084388185654007, "progress_pct": 33.47, "epoch_pct": 33.47, "eta": "45:34:08", "max_grad_norm": 0.8, "loss": 0.558707058429718, "grad_norm": 1.1743781566619873, "learning_rate": 8.22577800595272e-05} +{"ts": "2025-12-27T17:04:49", "event": "train_log", "step": 4762, "epoch": 2.009282700421941, "progress_pct": 33.49, "epoch_pct": 33.49, "eta": "45:32:49", "max_grad_norm": 0.8, "loss": 0.5943514108657837, "grad_norm": 1.0385273694992065, "learning_rate": 8.223942793977769e-05} +{"ts": "2025-12-27T17:05:02", "event": "train_log", "step": 4764, "epoch": 2.010126582278481, "progress_pct": 33.5, "epoch_pct": 33.5, "eta": "45:31:30", "max_grad_norm": 0.8, "loss": 0.5630753636360168, "grad_norm": 1.1302000284194946, "learning_rate": 8.222106838301131e-05} +{"ts": "2025-12-27T17:05:15", "event": "train_log", "step": 4766, "epoch": 2.010970464135021, "progress_pct": 33.52, "epoch_pct": 33.52, "eta": "45:30:12", "max_grad_norm": 0.8, "loss": 0.527510404586792, "grad_norm": 1.140005111694336, "learning_rate": 8.220270139346327e-05} +{"ts": "2025-12-27T17:05:27", "event": "train_log", "step": 4768, "epoch": 2.0118143459915614, "progress_pct": 33.53, "epoch_pct": 33.53, "eta": "45:28:54", "max_grad_norm": 0.8, "loss": 0.6315013766288757, "grad_norm": 1.1979734897613525, "learning_rate": 8.21843269753705e-05} +{"ts": "2025-12-27T17:05:40", "event": "train_log", "step": 4770, "epoch": 2.0126582278481013, "progress_pct": 33.54, "epoch_pct": 33.54, "eta": "45:27:35", "max_grad_norm": 0.8, "loss": 0.6225199699401855, "grad_norm": 1.3759459257125854, "learning_rate": 8.21659451329716e-05} +{"ts": "2025-12-27T17:05:52", "event": "train_log", "step": 4772, "epoch": 2.0135021097046413, "progress_pct": 33.56, "epoch_pct": 33.56, "eta": "45:26:16", "max_grad_norm": 0.8, "loss": 0.6838938593864441, "grad_norm": 1.330600380897522, "learning_rate": 8.21475558705069e-05} +{"ts": "2025-12-27T17:06:05", "event": "train_log", "step": 4774, "epoch": 2.014345991561181, "progress_pct": 33.57, "epoch_pct": 33.57, "eta": "45:24:59", "max_grad_norm": 0.8, "loss": 0.606302797794342, "grad_norm": 1.2365351915359497, "learning_rate": 8.21291591922185e-05} +{"ts": "2025-12-27T17:06:17", "event": "train_log", "step": 4776, "epoch": 2.0151898734177216, "progress_pct": 33.59, "epoch_pct": 33.59, "eta": "45:23:40", "max_grad_norm": 0.8, "loss": 0.6194182634353638, "grad_norm": 1.1886142492294312, "learning_rate": 8.211075510235011e-05} +{"ts": "2025-12-27T17:06:30", "event": "train_log", "step": 4778, "epoch": 2.0160337552742615, "progress_pct": 33.6, "epoch_pct": 33.6, "eta": "45:22:21", "max_grad_norm": 0.8, "loss": 0.639540433883667, "grad_norm": 1.1414743661880493, "learning_rate": 8.209234360514721e-05} +{"ts": "2025-12-27T17:06:41", "event": "train_log", "step": 4780, "epoch": 2.0168776371308015, "progress_pct": 33.61, "epoch_pct": 33.61, "eta": "45:21:02", "max_grad_norm": 0.8, "loss": 0.6350902318954468, "grad_norm": 1.2877455949783325, "learning_rate": 8.2073924704857e-05} +{"ts": "2025-12-27T17:06:54", "event": "train_log", "step": 4782, "epoch": 2.017721518987342, "progress_pct": 33.63, "epoch_pct": 33.63, "eta": "45:19:45", "max_grad_norm": 0.8, "loss": 0.5152000784873962, "grad_norm": 1.095578908920288, "learning_rate": 8.205549840572834e-05} +{"ts": "2025-12-27T17:07:08", "event": "train_log", "step": 4784, "epoch": 2.018565400843882, "progress_pct": 33.64, "epoch_pct": 33.64, "eta": "45:18:29", "max_grad_norm": 0.8, "loss": 0.46245837211608887, "grad_norm": 1.0043798685073853, "learning_rate": 8.203706471201183e-05} +{"ts": "2025-12-27T17:07:20", "event": "train_log", "step": 4786, "epoch": 2.0194092827004217, "progress_pct": 33.66, "epoch_pct": 33.66, "eta": "45:17:11", "max_grad_norm": 0.8, "loss": 0.6471722722053528, "grad_norm": 1.2133857011795044, "learning_rate": 8.201862362795979e-05} +{"ts": "2025-12-27T17:07:33", "event": "train_log", "step": 4788, "epoch": 2.020253164556962, "progress_pct": 33.67, "epoch_pct": 33.67, "eta": "45:15:53", "max_grad_norm": 0.8, "loss": 0.5790625214576721, "grad_norm": 1.0835390090942383, "learning_rate": 8.200017515782619e-05} +{"ts": "2025-12-27T17:07:46", "event": "train_log", "step": 4790, "epoch": 2.021097046413502, "progress_pct": 33.68, "epoch_pct": 33.68, "eta": "45:14:37", "max_grad_norm": 0.8, "loss": 0.5826238989830017, "grad_norm": 1.0176091194152832, "learning_rate": 8.198171930586678e-05} +{"ts": "2025-12-27T17:08:00", "event": "train_log", "step": 4792, "epoch": 2.021940928270042, "progress_pct": 33.7, "epoch_pct": 33.7, "eta": "45:13:20", "max_grad_norm": 0.8, "loss": 0.5781272649765015, "grad_norm": 1.1581370830535889, "learning_rate": 8.196325607633893e-05} +{"ts": "2025-12-27T17:08:12", "event": "train_log", "step": 4794, "epoch": 2.0227848101265824, "progress_pct": 33.71, "epoch_pct": 33.71, "eta": "45:12:02", "max_grad_norm": 0.8, "loss": 0.6600401997566223, "grad_norm": 1.243381142616272, "learning_rate": 8.194478547350178e-05} +{"ts": "2025-12-27T17:08:25", "event": "train_log", "step": 4796, "epoch": 2.0236286919831223, "progress_pct": 33.73, "epoch_pct": 33.73, "eta": "45:10:45", "max_grad_norm": 0.8, "loss": 0.5291268825531006, "grad_norm": 1.0718560218811035, "learning_rate": 8.192630750161612e-05} +{"ts": "2025-12-27T17:08:37", "event": "train_log", "step": 4798, "epoch": 2.0244725738396623, "progress_pct": 33.74, "epoch_pct": 33.74, "eta": "45:09:26", "max_grad_norm": 0.8, "loss": 0.6564924120903015, "grad_norm": 1.2338320016860962, "learning_rate": 8.190782216494448e-05} +{"ts": "2025-12-27T17:08:50", "event": "train_log", "step": 4800, "epoch": 2.0253164556962027, "progress_pct": 33.76, "epoch_pct": 33.76, "eta": "45:08:10", "max_grad_norm": 0.8, "loss": 0.5471183657646179, "grad_norm": 0.978547990322113, "learning_rate": 8.188932946775107e-05} +{"ts": "2025-12-27T17:17:24", "event": "train_log", "step": 4800, "epoch": 2.0253164556962027, "progress_pct": 33.76, "epoch_pct": 33.76, "eta": "45:24:59", "max_grad_norm": 0.8, "eval_loss": 0.6924457550048828, "eval_runtime": 514.0427, "eval_samples_per_second": 4.099, "eval_steps_per_second": 4.099} +{"ts": "2025-12-27T17:17:36", "event": "train_log", "step": 4802, "epoch": 2.0261603375527426, "progress_pct": 33.77, "epoch_pct": 33.77, "eta": "45:23:40", "max_grad_norm": 0.8, "loss": 0.567442774772644, "grad_norm": 1.1782792806625366, "learning_rate": 8.18708294143018e-05} +{"ts": "2025-12-27T17:17:49", "event": "train_log", "step": 4804, "epoch": 2.0270042194092825, "progress_pct": 33.78, "epoch_pct": 33.78, "eta": "45:22:21", "max_grad_norm": 0.8, "loss": 0.6005180478096008, "grad_norm": 1.0768574476242065, "learning_rate": 8.185232200886426e-05} +{"ts": "2025-12-27T17:18:01", "event": "train_log", "step": 4806, "epoch": 2.027848101265823, "progress_pct": 33.8, "epoch_pct": 33.8, "eta": "45:21:03", "max_grad_norm": 0.8, "loss": 0.616436779499054, "grad_norm": 1.3096717596054077, "learning_rate": 8.18338072557078e-05} +{"ts": "2025-12-27T17:18:14", "event": "train_log", "step": 4808, "epoch": 2.028691983122363, "progress_pct": 33.81, "epoch_pct": 33.81, "eta": "45:19:45", "max_grad_norm": 0.8, "loss": 0.49587416648864746, "grad_norm": 1.0233508348464966, "learning_rate": 8.181528515910336e-05} +{"ts": "2025-12-27T17:18:25", "event": "train_log", "step": 4810, "epoch": 2.029535864978903, "progress_pct": 33.83, "epoch_pct": 33.83, "eta": "45:18:26", "max_grad_norm": 0.8, "loss": 0.5758571624755859, "grad_norm": 1.0800065994262695, "learning_rate": 8.179675572332366e-05} +{"ts": "2025-12-27T17:18:38", "event": "train_log", "step": 4812, "epoch": 2.030379746835443, "progress_pct": 33.84, "epoch_pct": 33.84, "eta": "45:17:08", "max_grad_norm": 0.8, "loss": 0.561736524105072, "grad_norm": 1.09299898147583, "learning_rate": 8.177821895264309e-05} +{"ts": "2025-12-27T17:18:52", "event": "train_log", "step": 4814, "epoch": 2.031223628691983, "progress_pct": 33.85, "epoch_pct": 33.85, "eta": "45:15:52", "max_grad_norm": 0.8, "loss": 0.5249468088150024, "grad_norm": 1.1439210176467896, "learning_rate": 8.175967485133771e-05} +{"ts": "2025-12-27T17:19:05", "event": "train_log", "step": 4816, "epoch": 2.032067510548523, "progress_pct": 33.87, "epoch_pct": 33.87, "eta": "45:14:36", "max_grad_norm": 0.8, "loss": 0.6429001688957214, "grad_norm": 1.15841805934906, "learning_rate": 8.174112342368532e-05} +{"ts": "2025-12-27T17:19:17", "event": "train_log", "step": 4818, "epoch": 2.0329113924050635, "progress_pct": 33.88, "epoch_pct": 33.88, "eta": "45:13:18", "max_grad_norm": 0.8, "loss": 0.60152667760849, "grad_norm": 1.1720670461654663, "learning_rate": 8.172256467396533e-05} +{"ts": "2025-12-27T17:19:30", "event": "train_log", "step": 4820, "epoch": 2.0337552742616034, "progress_pct": 33.9, "epoch_pct": 33.9, "eta": "45:12:01", "max_grad_norm": 0.8, "loss": 0.5553541779518127, "grad_norm": 1.2652091979980469, "learning_rate": 8.170399860645892e-05} +{"ts": "2025-12-27T17:19:44", "event": "train_log", "step": 4822, "epoch": 2.0345991561181433, "progress_pct": 33.91, "epoch_pct": 33.91, "eta": "45:10:45", "max_grad_norm": 0.8, "loss": 0.5369323492050171, "grad_norm": 1.0768507719039917, "learning_rate": 8.168542522544893e-05} +{"ts": "2025-12-27T17:19:56", "event": "train_log", "step": 4824, "epoch": 2.0354430379746837, "progress_pct": 33.92, "epoch_pct": 33.92, "eta": "45:09:27", "max_grad_norm": 0.8, "loss": 0.5468952655792236, "grad_norm": 0.9906469583511353, "learning_rate": 8.166684453521986e-05} +{"ts": "2025-12-27T17:20:08", "event": "train_log", "step": 4826, "epoch": 2.0362869198312237, "progress_pct": 33.94, "epoch_pct": 33.94, "eta": "45:08:07", "max_grad_norm": 0.8, "loss": 0.5795659422874451, "grad_norm": 1.3448988199234009, "learning_rate": 8.164825654005792e-05} +{"ts": "2025-12-27T17:20:19", "event": "train_log", "step": 4828, "epoch": 2.0371308016877636, "progress_pct": 33.95, "epoch_pct": 33.95, "eta": "45:06:48", "max_grad_norm": 0.8, "loss": 0.6465779542922974, "grad_norm": 1.2502341270446777, "learning_rate": 8.162966124425103e-05} +{"ts": "2025-12-27T17:20:32", "event": "train_log", "step": 4830, "epoch": 2.037974683544304, "progress_pct": 33.97, "epoch_pct": 33.97, "eta": "45:05:31", "max_grad_norm": 0.8, "loss": 0.5509394407272339, "grad_norm": 1.1512303352355957, "learning_rate": 8.161105865208875e-05} +{"ts": "2025-12-27T17:20:44", "event": "train_log", "step": 4832, "epoch": 2.038818565400844, "progress_pct": 33.98, "epoch_pct": 33.98, "eta": "45:04:12", "max_grad_norm": 0.8, "loss": 0.5515735745429993, "grad_norm": 1.2513408660888672, "learning_rate": 8.159244876786232e-05} +{"ts": "2025-12-27T17:20:56", "event": "train_log", "step": 4834, "epoch": 2.039662447257384, "progress_pct": 33.99, "epoch_pct": 33.99, "eta": "45:02:54", "max_grad_norm": 0.8, "loss": 0.757799506187439, "grad_norm": 1.3035682439804077, "learning_rate": 8.157383159586473e-05} +{"ts": "2025-12-27T17:21:08", "event": "train_log", "step": 4836, "epoch": 2.0405063291139243, "progress_pct": 34.01, "epoch_pct": 34.01, "eta": "45:01:37", "max_grad_norm": 0.8, "loss": 0.607295036315918, "grad_norm": 1.1136540174484253, "learning_rate": 8.155520714039056e-05} +{"ts": "2025-12-27T17:21:21", "event": "train_log", "step": 4838, "epoch": 2.041350210970464, "progress_pct": 34.02, "epoch_pct": 34.02, "eta": "45:00:20", "max_grad_norm": 0.8, "loss": 0.5769712328910828, "grad_norm": 1.220146656036377, "learning_rate": 8.153657540573613e-05} +{"ts": "2025-12-27T17:21:33", "event": "train_log", "step": 4840, "epoch": 2.042194092827004, "progress_pct": 34.04, "epoch_pct": 34.04, "eta": "44:59:01", "max_grad_norm": 0.8, "loss": 0.5746933817863464, "grad_norm": 1.2104195356369019, "learning_rate": 8.151793639619944e-05} +{"ts": "2025-12-27T17:21:46", "event": "train_log", "step": 4842, "epoch": 2.043037974683544, "progress_pct": 34.05, "epoch_pct": 34.05, "eta": "44:57:44", "max_grad_norm": 0.8, "loss": 0.5932332277297974, "grad_norm": 1.241708517074585, "learning_rate": 8.149929011608014e-05} +{"ts": "2025-12-27T17:21:58", "event": "train_log", "step": 4844, "epoch": 2.0438818565400845, "progress_pct": 34.06, "epoch_pct": 34.06, "eta": "44:56:27", "max_grad_norm": 0.8, "loss": 0.583284318447113, "grad_norm": 1.1172713041305542, "learning_rate": 8.148063656967955e-05} +{"ts": "2025-12-27T17:22:11", "event": "train_log", "step": 4846, "epoch": 2.0447257383966244, "progress_pct": 34.08, "epoch_pct": 34.08, "eta": "44:55:10", "max_grad_norm": 0.8, "loss": 0.5589476823806763, "grad_norm": 1.0867618322372437, "learning_rate": 8.14619757613007e-05} +{"ts": "2025-12-27T17:22:23", "event": "train_log", "step": 4848, "epoch": 2.0455696202531644, "progress_pct": 34.09, "epoch_pct": 34.09, "eta": "44:53:53", "max_grad_norm": 0.8, "loss": 0.6118156313896179, "grad_norm": 1.2470483779907227, "learning_rate": 8.14433076952483e-05} +{"ts": "2025-12-27T17:22:36", "event": "train_log", "step": 4850, "epoch": 2.0464135021097047, "progress_pct": 34.11, "epoch_pct": 34.11, "eta": "44:52:38", "max_grad_norm": 0.8, "loss": 0.5815895795822144, "grad_norm": 1.0908832550048828, "learning_rate": 8.142463237582868e-05} +{"ts": "2025-12-27T17:22:49", "event": "train_log", "step": 4852, "epoch": 2.0472573839662447, "progress_pct": 34.12, "epoch_pct": 34.12, "eta": "44:51:20", "max_grad_norm": 0.8, "loss": 0.6232373714447021, "grad_norm": 1.2589281797409058, "learning_rate": 8.140594980734989e-05} +{"ts": "2025-12-27T17:23:01", "event": "train_log", "step": 4854, "epoch": 2.0481012658227846, "progress_pct": 34.14, "epoch_pct": 34.14, "eta": "44:50:03", "max_grad_norm": 0.8, "loss": 0.5992053151130676, "grad_norm": 1.234152913093567, "learning_rate": 8.138725999412165e-05} +{"ts": "2025-12-27T17:23:13", "event": "train_log", "step": 4856, "epoch": 2.048945147679325, "progress_pct": 34.15, "epoch_pct": 34.15, "eta": "44:48:45", "max_grad_norm": 0.8, "loss": 0.6494496464729309, "grad_norm": 1.3304446935653687, "learning_rate": 8.136856294045533e-05} +{"ts": "2025-12-27T17:23:25", "event": "train_log", "step": 4858, "epoch": 2.049789029535865, "progress_pct": 34.16, "epoch_pct": 34.16, "eta": "44:47:27", "max_grad_norm": 0.8, "loss": 0.6263431906700134, "grad_norm": 1.1871088743209839, "learning_rate": 8.134985865066398e-05} +{"ts": "2025-12-27T17:23:37", "event": "train_log", "step": 4860, "epoch": 2.050632911392405, "progress_pct": 34.18, "epoch_pct": 34.18, "eta": "44:46:10", "max_grad_norm": 0.8, "loss": 0.6036502122879028, "grad_norm": 1.1454699039459229, "learning_rate": 8.133114712906234e-05} +{"ts": "2025-12-27T17:23:50", "event": "train_log", "step": 4862, "epoch": 2.0514767932489453, "progress_pct": 34.19, "epoch_pct": 34.19, "eta": "44:44:54", "max_grad_norm": 0.8, "loss": 0.5674451589584351, "grad_norm": 1.2953420877456665, "learning_rate": 8.131242837996675e-05} +{"ts": "2025-12-27T17:24:03", "event": "train_log", "step": 4864, "epoch": 2.052320675105485, "progress_pct": 34.21, "epoch_pct": 34.21, "eta": "44:43:38", "max_grad_norm": 0.8, "loss": 0.5616317987442017, "grad_norm": 1.1874405145645142, "learning_rate": 8.129370240769534e-05} +{"ts": "2025-12-27T17:24:14", "event": "train_log", "step": 4866, "epoch": 2.053164556962025, "progress_pct": 34.22, "epoch_pct": 34.22, "eta": "44:42:20", "max_grad_norm": 0.8, "loss": 0.6495023369789124, "grad_norm": 1.2936227321624756, "learning_rate": 8.127496921656777e-05} +{"ts": "2025-12-27T17:24:27", "event": "train_log", "step": 4868, "epoch": 2.0540084388185655, "progress_pct": 34.23, "epoch_pct": 34.23, "eta": "44:41:03", "max_grad_norm": 0.8, "loss": 0.6028099060058594, "grad_norm": 1.1935228109359741, "learning_rate": 8.125622881090544e-05} +{"ts": "2025-12-27T17:24:40", "event": "train_log", "step": 4870, "epoch": 2.0548523206751055, "progress_pct": 34.25, "epoch_pct": 34.25, "eta": "44:39:48", "max_grad_norm": 0.8, "loss": 0.476296067237854, "grad_norm": 0.9932331442832947, "learning_rate": 8.123748119503143e-05} +{"ts": "2025-12-27T17:24:52", "event": "train_log", "step": 4872, "epoch": 2.0556962025316454, "progress_pct": 34.26, "epoch_pct": 34.26, "eta": "44:38:31", "max_grad_norm": 0.8, "loss": 0.6191902756690979, "grad_norm": 1.3878839015960693, "learning_rate": 8.121872637327042e-05} +{"ts": "2025-12-27T17:25:05", "event": "train_log", "step": 4874, "epoch": 2.056540084388186, "progress_pct": 34.28, "epoch_pct": 34.28, "eta": "44:37:15", "max_grad_norm": 0.8, "loss": 0.566487729549408, "grad_norm": 1.1185581684112549, "learning_rate": 8.11999643499488e-05} +{"ts": "2025-12-27T17:25:17", "event": "train_log", "step": 4876, "epoch": 2.0573839662447257, "progress_pct": 34.29, "epoch_pct": 34.29, "eta": "44:35:59", "max_grad_norm": 0.8, "loss": 0.5970078706741333, "grad_norm": 1.3729257583618164, "learning_rate": 8.118119512939464e-05} +{"ts": "2025-12-27T17:25:31", "event": "train_log", "step": 4878, "epoch": 2.0582278481012657, "progress_pct": 34.3, "epoch_pct": 34.3, "eta": "44:34:44", "max_grad_norm": 0.8, "loss": 0.570341944694519, "grad_norm": 1.1332688331604004, "learning_rate": 8.11624187159376e-05} +{"ts": "2025-12-27T17:25:43", "event": "train_log", "step": 4880, "epoch": 2.059071729957806, "progress_pct": 34.32, "epoch_pct": 34.32, "eta": "44:33:27", "max_grad_norm": 0.8, "loss": 0.6302897334098816, "grad_norm": 1.2648937702178955, "learning_rate": 8.114363511390903e-05} +{"ts": "2025-12-27T17:25:55", "event": "train_log", "step": 4882, "epoch": 2.059915611814346, "progress_pct": 34.33, "epoch_pct": 34.33, "eta": "44:32:11", "max_grad_norm": 0.8, "loss": 0.5619142651557922, "grad_norm": 1.250616192817688, "learning_rate": 8.112484432764197e-05} +{"ts": "2025-12-27T17:26:09", "event": "train_log", "step": 4884, "epoch": 2.060759493670886, "progress_pct": 34.35, "epoch_pct": 34.35, "eta": "44:30:58", "max_grad_norm": 0.8, "loss": 0.5426228642463684, "grad_norm": 0.9710861444473267, "learning_rate": 8.110604636147109e-05} +{"ts": "2025-12-27T17:26:22", "event": "train_log", "step": 4886, "epoch": 2.0616033755274263, "progress_pct": 34.36, "epoch_pct": 34.36, "eta": "44:29:42", "max_grad_norm": 0.8, "loss": 0.5498107671737671, "grad_norm": 1.1979506015777588, "learning_rate": 8.108724121973271e-05} +{"ts": "2025-12-27T17:26:35", "event": "train_log", "step": 4888, "epoch": 2.0624472573839663, "progress_pct": 34.37, "epoch_pct": 34.37, "eta": "44:28:27", "max_grad_norm": 0.8, "loss": 0.5695134401321411, "grad_norm": 1.0936485528945923, "learning_rate": 8.106842890676483e-05} +{"ts": "2025-12-27T17:26:48", "event": "train_log", "step": 4890, "epoch": 2.0632911392405062, "progress_pct": 34.39, "epoch_pct": 34.39, "eta": "44:27:12", "max_grad_norm": 0.8, "loss": 0.5998331308364868, "grad_norm": 1.1246092319488525, "learning_rate": 8.10496094269071e-05} +{"ts": "2025-12-27T17:27:01", "event": "train_log", "step": 4892, "epoch": 2.0641350210970466, "progress_pct": 34.4, "epoch_pct": 34.4, "eta": "44:25:57", "max_grad_norm": 0.8, "loss": 0.5702623128890991, "grad_norm": 1.244438648223877, "learning_rate": 8.103078278450075e-05} +{"ts": "2025-12-27T17:27:19", "event": "train_log", "step": 4894, "epoch": 2.0649789029535865, "progress_pct": 34.42, "epoch_pct": 34.42, "eta": "44:24:51", "max_grad_norm": 0.8, "loss": 0.5392299890518188, "grad_norm": 1.1585633754730225, "learning_rate": 8.101194898388881e-05} +{"ts": "2025-12-27T17:27:33", "event": "train_log", "step": 4896, "epoch": 2.0658227848101265, "progress_pct": 34.43, "epoch_pct": 34.43, "eta": "44:23:39", "max_grad_norm": 0.8, "loss": 0.5640127658843994, "grad_norm": 1.3044285774230957, "learning_rate": 8.099310802941582e-05} +{"ts": "2025-12-27T17:27:45", "event": "train_log", "step": 4898, "epoch": 2.066666666666667, "progress_pct": 34.44, "epoch_pct": 34.44, "eta": "44:22:23", "max_grad_norm": 0.8, "loss": 0.6103175282478333, "grad_norm": 1.2483032941818237, "learning_rate": 8.097425992542804e-05} +{"ts": "2025-12-27T17:27:59", "event": "train_log", "step": 4900, "epoch": 2.067510548523207, "progress_pct": 34.46, "epoch_pct": 34.46, "eta": "44:21:09", "max_grad_norm": 0.8, "loss": 0.5041166543960571, "grad_norm": 1.0845462083816528, "learning_rate": 8.095540467627337e-05} +{"ts": "2025-12-27T17:36:32", "event": "train_log", "step": 4900, "epoch": 2.067510548523207, "progress_pct": 34.46, "epoch_pct": 34.46, "eta": "44:37:26", "max_grad_norm": 0.8, "eval_loss": 0.6941288113594055, "eval_runtime": 513.4497, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104} +{"ts": "2025-12-27T17:36:45", "event": "train_log", "step": 4902, "epoch": 2.0683544303797468, "progress_pct": 34.47, "epoch_pct": 34.47, "eta": "44:36:10", "max_grad_norm": 0.8, "loss": 0.6253946423530579, "grad_norm": 1.2493232488632202, "learning_rate": 8.093654228630134e-05} +{"ts": "2025-12-27T17:36:59", "event": "train_log", "step": 4904, "epoch": 2.0691983122362867, "progress_pct": 34.49, "epoch_pct": 34.49, "eta": "44:34:57", "max_grad_norm": 0.8, "loss": 0.523486852645874, "grad_norm": 1.1668756008148193, "learning_rate": 8.091767275986317e-05} +{"ts": "2025-12-27T17:37:12", "event": "train_log", "step": 4906, "epoch": 2.070042194092827, "progress_pct": 34.5, "epoch_pct": 34.5, "eta": "44:33:41", "max_grad_norm": 0.8, "loss": 0.5569989681243896, "grad_norm": 1.1709638833999634, "learning_rate": 8.089879610131167e-05} +{"ts": "2025-12-27T17:37:25", "event": "train_log", "step": 4908, "epoch": 2.070886075949367, "progress_pct": 34.51, "epoch_pct": 34.51, "eta": "44:32:26", "max_grad_norm": 0.8, "loss": 0.642728865146637, "grad_norm": 1.1044740676879883, "learning_rate": 8.087991231500133e-05} +{"ts": "2025-12-27T17:37:38", "event": "train_log", "step": 4910, "epoch": 2.071729957805907, "progress_pct": 34.53, "epoch_pct": 34.53, "eta": "44:31:11", "max_grad_norm": 0.8, "loss": 0.5998259782791138, "grad_norm": 1.1032549142837524, "learning_rate": 8.086102140528828e-05} +{"ts": "2025-12-27T17:37:52", "event": "train_log", "step": 4912, "epoch": 2.0725738396624473, "progress_pct": 34.54, "epoch_pct": 34.54, "eta": "44:29:57", "max_grad_norm": 0.8, "loss": 0.5460172891616821, "grad_norm": 0.9980027079582214, "learning_rate": 8.08421233765303e-05} +{"ts": "2025-12-27T17:38:04", "event": "train_log", "step": 4914, "epoch": 2.0734177215189873, "progress_pct": 34.56, "epoch_pct": 34.56, "eta": "44:28:41", "max_grad_norm": 0.8, "loss": 0.5643284916877747, "grad_norm": 1.0866090059280396, "learning_rate": 8.082321823308679e-05} +{"ts": "2025-12-27T17:38:17", "event": "train_log", "step": 4916, "epoch": 2.0742616033755272, "progress_pct": 34.57, "epoch_pct": 34.57, "eta": "44:27:25", "max_grad_norm": 0.8, "loss": 0.554400622844696, "grad_norm": 1.1942687034606934, "learning_rate": 8.080430597931878e-05} +{"ts": "2025-12-27T17:38:29", "event": "train_log", "step": 4918, "epoch": 2.0751054852320676, "progress_pct": 34.59, "epoch_pct": 34.59, "eta": "44:26:09", "max_grad_norm": 0.8, "loss": 0.5955621004104614, "grad_norm": 1.0680599212646484, "learning_rate": 8.078538661958901e-05} +{"ts": "2025-12-27T17:38:41", "event": "train_log", "step": 4920, "epoch": 2.0759493670886076, "progress_pct": 34.6, "epoch_pct": 34.6, "eta": "44:24:53", "max_grad_norm": 0.8, "loss": 0.5970203280448914, "grad_norm": 1.20845627784729, "learning_rate": 8.076646015826179e-05} +{"ts": "2025-12-27T17:38:54", "event": "train_log", "step": 4922, "epoch": 2.0767932489451475, "progress_pct": 34.61, "epoch_pct": 34.61, "eta": "44:23:38", "max_grad_norm": 0.8, "loss": 0.6467664837837219, "grad_norm": 1.8368924856185913, "learning_rate": 8.074752659970308e-05} +{"ts": "2025-12-27T17:39:07", "event": "train_log", "step": 4924, "epoch": 2.077637130801688, "progress_pct": 34.63, "epoch_pct": 34.63, "eta": "44:22:22", "max_grad_norm": 0.8, "loss": 0.630719006061554, "grad_norm": 1.3291922807693481, "learning_rate": 8.072858594828053e-05} +{"ts": "2025-12-27T17:39:19", "event": "train_log", "step": 4926, "epoch": 2.078481012658228, "progress_pct": 34.64, "epoch_pct": 34.64, "eta": "44:21:06", "max_grad_norm": 0.8, "loss": 0.601140022277832, "grad_norm": 1.1496083736419678, "learning_rate": 8.070963820836333e-05} +{"ts": "2025-12-27T17:39:32", "event": "train_log", "step": 4928, "epoch": 2.0793248945147678, "progress_pct": 34.66, "epoch_pct": 34.66, "eta": "44:19:52", "max_grad_norm": 0.8, "loss": 0.6096881031990051, "grad_norm": 1.1562724113464355, "learning_rate": 8.069068338432239e-05} +{"ts": "2025-12-27T17:39:46", "event": "train_log", "step": 4930, "epoch": 2.080168776371308, "progress_pct": 34.67, "epoch_pct": 34.67, "eta": "44:18:39", "max_grad_norm": 0.8, "loss": 0.5085908770561218, "grad_norm": 1.0115300416946411, "learning_rate": 8.067172148053021e-05} +{"ts": "2025-12-27T17:39:58", "event": "train_log", "step": 4932, "epoch": 2.081012658227848, "progress_pct": 34.68, "epoch_pct": 34.68, "eta": "44:17:23", "max_grad_norm": 0.8, "loss": 0.5268720984458923, "grad_norm": 1.2181830406188965, "learning_rate": 8.065275250136097e-05} +{"ts": "2025-12-27T17:40:11", "event": "train_log", "step": 4934, "epoch": 2.081856540084388, "progress_pct": 34.7, "epoch_pct": 34.7, "eta": "44:16:07", "max_grad_norm": 0.8, "loss": 0.6075665950775146, "grad_norm": 1.1249788999557495, "learning_rate": 8.06337764511904e-05} +{"ts": "2025-12-27T17:40:23", "event": "train_log", "step": 4936, "epoch": 2.0827004219409284, "progress_pct": 34.71, "epoch_pct": 34.71, "eta": "44:14:52", "max_grad_norm": 0.8, "loss": 0.59170001745224, "grad_norm": 1.1143964529037476, "learning_rate": 8.061479333439595e-05} +{"ts": "2025-12-27T17:40:35", "event": "train_log", "step": 4938, "epoch": 2.0835443037974684, "progress_pct": 34.73, "epoch_pct": 34.73, "eta": "44:13:36", "max_grad_norm": 0.8, "loss": 0.6689745187759399, "grad_norm": 1.4773131608963013, "learning_rate": 8.059580315535664e-05} +{"ts": "2025-12-27T17:40:48", "event": "train_log", "step": 4940, "epoch": 2.0843881856540083, "progress_pct": 34.74, "epoch_pct": 34.74, "eta": "44:12:22", "max_grad_norm": 0.8, "loss": 0.5409777760505676, "grad_norm": 1.143965244293213, "learning_rate": 8.057680591845316e-05} +{"ts": "2025-12-27T17:41:01", "event": "train_log", "step": 4942, "epoch": 2.0852320675105487, "progress_pct": 34.75, "epoch_pct": 34.75, "eta": "44:11:08", "max_grad_norm": 0.8, "loss": 0.5778636336326599, "grad_norm": 1.0384942293167114, "learning_rate": 8.055780162806777e-05} +{"ts": "2025-12-27T17:41:15", "event": "train_log", "step": 4944, "epoch": 2.0860759493670886, "progress_pct": 34.77, "epoch_pct": 34.77, "eta": "44:09:54", "max_grad_norm": 0.8, "loss": 0.5576038360595703, "grad_norm": 1.0102177858352661, "learning_rate": 8.053879028858442e-05} +{"ts": "2025-12-27T17:41:27", "event": "train_log", "step": 4946, "epoch": 2.0869198312236286, "progress_pct": 34.78, "epoch_pct": 34.78, "eta": "44:08:39", "max_grad_norm": 0.8, "loss": 0.5873376131057739, "grad_norm": 1.3792158365249634, "learning_rate": 8.051977190438868e-05} +{"ts": "2025-12-27T17:41:39", "event": "train_log", "step": 4948, "epoch": 2.087763713080169, "progress_pct": 34.8, "epoch_pct": 34.8, "eta": "44:07:23", "max_grad_norm": 0.8, "loss": 0.6067743301391602, "grad_norm": 1.4402949810028076, "learning_rate": 8.050074647986768e-05} +{"ts": "2025-12-27T17:41:52", "event": "train_log", "step": 4950, "epoch": 2.088607594936709, "progress_pct": 34.81, "epoch_pct": 34.81, "eta": "44:06:08", "max_grad_norm": 0.8, "loss": 0.604671835899353, "grad_norm": 1.2719058990478516, "learning_rate": 8.048171401941027e-05} +{"ts": "2025-12-27T17:42:04", "event": "train_log", "step": 4952, "epoch": 2.089451476793249, "progress_pct": 34.82, "epoch_pct": 34.82, "eta": "44:04:53", "max_grad_norm": 0.8, "loss": 0.5743544697761536, "grad_norm": 1.1054867506027222, "learning_rate": 8.046267452740683e-05} +{"ts": "2025-12-27T17:42:18", "event": "train_log", "step": 4954, "epoch": 2.090295358649789, "progress_pct": 34.84, "epoch_pct": 34.84, "eta": "44:03:40", "max_grad_norm": 0.8, "loss": 0.576278567314148, "grad_norm": 1.0521535873413086, "learning_rate": 8.044362800824944e-05} +{"ts": "2025-12-27T17:42:30", "event": "train_log", "step": 4956, "epoch": 2.091139240506329, "progress_pct": 34.85, "epoch_pct": 34.85, "eta": "44:02:24", "max_grad_norm": 0.8, "loss": 0.5903641581535339, "grad_norm": 1.2665088176727295, "learning_rate": 8.042457446633174e-05} +{"ts": "2025-12-27T17:42:42", "event": "train_log", "step": 4958, "epoch": 2.091983122362869, "progress_pct": 34.87, "epoch_pct": 34.87, "eta": "44:01:09", "max_grad_norm": 0.8, "loss": 0.5854214429855347, "grad_norm": 1.1283398866653442, "learning_rate": 8.040551390604902e-05} +{"ts": "2025-12-27T17:42:55", "event": "train_log", "step": 4960, "epoch": 2.0928270042194095, "progress_pct": 34.88, "epoch_pct": 34.88, "eta": "43:59:56", "max_grad_norm": 0.8, "loss": 0.5843619108200073, "grad_norm": 1.1194316148757935, "learning_rate": 8.03864463317982e-05} +{"ts": "2025-12-27T17:43:07", "event": "train_log", "step": 4962, "epoch": 2.0936708860759494, "progress_pct": 34.89, "epoch_pct": 34.89, "eta": "43:58:39", "max_grad_norm": 0.8, "loss": 0.6115096211433411, "grad_norm": 1.3581651449203491, "learning_rate": 8.036737174797778e-05} +{"ts": "2025-12-27T17:43:19", "event": "train_log", "step": 4964, "epoch": 2.0945147679324894, "progress_pct": 34.91, "epoch_pct": 34.91, "eta": "43:57:24", "max_grad_norm": 0.8, "loss": 0.5998795032501221, "grad_norm": 1.341748595237732, "learning_rate": 8.034829015898793e-05} +{"ts": "2025-12-27T17:43:32", "event": "train_log", "step": 4966, "epoch": 2.0953586497890297, "progress_pct": 34.92, "epoch_pct": 34.92, "eta": "43:56:10", "max_grad_norm": 0.8, "loss": 0.628372311592102, "grad_norm": 1.2212611436843872, "learning_rate": 8.032920156923038e-05} +{"ts": "2025-12-27T17:43:45", "event": "train_log", "step": 4968, "epoch": 2.0962025316455697, "progress_pct": 34.94, "epoch_pct": 34.94, "eta": "43:54:56", "max_grad_norm": 0.8, "loss": 0.5668916702270508, "grad_norm": 1.1348317861557007, "learning_rate": 8.031010598310851e-05} +{"ts": "2025-12-27T17:43:58", "event": "train_log", "step": 4970, "epoch": 2.0970464135021096, "progress_pct": 34.95, "epoch_pct": 34.95, "eta": "43:53:44", "max_grad_norm": 0.8, "loss": 0.5253881216049194, "grad_norm": 1.1106547117233276, "learning_rate": 8.029100340502731e-05} +{"ts": "2025-12-27T17:44:10", "event": "train_log", "step": 4972, "epoch": 2.09789029535865, "progress_pct": 34.96, "epoch_pct": 34.96, "eta": "43:52:28", "max_grad_norm": 0.8, "loss": 0.5790762901306152, "grad_norm": 1.2471354007720947, "learning_rate": 8.027189383939339e-05} +{"ts": "2025-12-27T17:44:23", "event": "train_log", "step": 4974, "epoch": 2.09873417721519, "progress_pct": 34.98, "epoch_pct": 34.98, "eta": "43:51:14", "max_grad_norm": 0.8, "loss": 0.6382888555526733, "grad_norm": 1.2477394342422485, "learning_rate": 8.025277729061492e-05} +{"ts": "2025-12-27T17:44:35", "event": "train_log", "step": 4976, "epoch": 2.09957805907173, "progress_pct": 34.99, "epoch_pct": 34.99, "eta": "43:49:59", "max_grad_norm": 0.8, "loss": 0.5962072610855103, "grad_norm": 1.2716054916381836, "learning_rate": 8.023365376310176e-05} +{"ts": "2025-12-27T17:44:47", "event": "train_log", "step": 4978, "epoch": 2.10042194092827, "progress_pct": 35.01, "epoch_pct": 35.01, "eta": "43:48:44", "max_grad_norm": 0.8, "loss": 0.5882940292358398, "grad_norm": 1.257820725440979, "learning_rate": 8.021452326126532e-05} +{"ts": "2025-12-27T17:45:00", "event": "train_log", "step": 4980, "epoch": 2.1012658227848102, "progress_pct": 35.02, "epoch_pct": 35.02, "eta": "43:47:29", "max_grad_norm": 0.8, "loss": 0.5640701055526733, "grad_norm": 1.0924186706542969, "learning_rate": 8.019538578951864e-05} +{"ts": "2025-12-27T17:45:12", "event": "train_log", "step": 4982, "epoch": 2.10210970464135, "progress_pct": 35.04, "epoch_pct": 35.04, "eta": "43:46:15", "max_grad_norm": 0.8, "loss": 0.5746428966522217, "grad_norm": 1.1250383853912354, "learning_rate": 8.017624135227637e-05} +{"ts": "2025-12-27T17:45:26", "event": "train_log", "step": 4984, "epoch": 2.10295358649789, "progress_pct": 35.05, "epoch_pct": 35.05, "eta": "43:45:03", "max_grad_norm": 0.8, "loss": 0.5611346960067749, "grad_norm": 1.131323218345642, "learning_rate": 8.015708995395477e-05} +{"ts": "2025-12-27T17:45:38", "event": "train_log", "step": 4986, "epoch": 2.1037974683544305, "progress_pct": 35.06, "epoch_pct": 35.06, "eta": "43:43:49", "max_grad_norm": 0.8, "loss": 0.6173797249794006, "grad_norm": 1.4267152547836304, "learning_rate": 8.013793159897171e-05} +{"ts": "2025-12-27T17:45:50", "event": "train_log", "step": 4988, "epoch": 2.1046413502109704, "progress_pct": 35.08, "epoch_pct": 35.08, "eta": "43:42:33", "max_grad_norm": 0.8, "loss": 0.64865642786026, "grad_norm": 1.41414213180542, "learning_rate": 8.011876629174662e-05} +{"ts": "2025-12-27T17:46:03", "event": "train_log", "step": 4990, "epoch": 2.1054852320675104, "progress_pct": 35.09, "epoch_pct": 35.09, "eta": "43:41:20", "max_grad_norm": 0.8, "loss": 0.6125827431678772, "grad_norm": 1.1498184204101562, "learning_rate": 8.00995940367006e-05} +{"ts": "2025-12-27T17:46:15", "event": "train_log", "step": 4992, "epoch": 2.1063291139240508, "progress_pct": 35.11, "epoch_pct": 35.11, "eta": "43:40:06", "max_grad_norm": 0.8, "loss": 0.670495867729187, "grad_norm": 1.2327708005905151, "learning_rate": 8.00804148382563e-05} +{"ts": "2025-12-27T17:46:27", "event": "train_log", "step": 4994, "epoch": 2.1071729957805907, "progress_pct": 35.12, "epoch_pct": 35.12, "eta": "43:38:51", "max_grad_norm": 0.8, "loss": 0.6020209193229675, "grad_norm": 1.2797311544418335, "learning_rate": 8.0061228700838e-05} +{"ts": "2025-12-27T17:46:41", "event": "train_log", "step": 4996, "epoch": 2.1080168776371306, "progress_pct": 35.13, "epoch_pct": 35.13, "eta": "43:37:39", "max_grad_norm": 0.8, "loss": 0.5974310636520386, "grad_norm": 1.079584002494812, "learning_rate": 8.004203562887157e-05} +{"ts": "2025-12-27T17:46:53", "event": "train_log", "step": 4998, "epoch": 2.108860759493671, "progress_pct": 35.15, "epoch_pct": 35.15, "eta": "43:36:25", "max_grad_norm": 0.8, "loss": 0.6424587368965149, "grad_norm": 1.4352604150772095, "learning_rate": 8.002283562678452e-05} +{"ts": "2025-12-27T17:47:06", "event": "train_log", "step": 5000, "epoch": 2.109704641350211, "progress_pct": 35.16, "epoch_pct": 35.16, "eta": "43:35:12", "max_grad_norm": 0.8, "loss": 0.6185846328735352, "grad_norm": 1.0876719951629639, "learning_rate": 8.000362869900586e-05} +{"ts": "2025-12-27T17:58:22", "event": "train_log", "step": 5000, "epoch": 2.109704641350211, "progress_pct": 35.16, "epoch_pct": 35.16, "eta": "43:55:58", "max_grad_norm": 0.8, "eval_loss": 0.6908889412879944, "eval_runtime": 675.8398, "eval_samples_per_second": 3.118, "eval_steps_per_second": 3.118} +{"ts": "2025-12-27T17:58:35", "event": "train_log", "step": 5002, "epoch": 2.110548523206751, "progress_pct": 35.18, "epoch_pct": 35.18, "eta": "43:54:45", "max_grad_norm": 0.8, "loss": 0.6127280592918396, "grad_norm": 1.0125762224197388, "learning_rate": 7.998441484996631e-05} +{"ts": "2025-12-27T17:58:48", "event": "train_log", "step": 5004, "epoch": 2.1113924050632913, "progress_pct": 35.19, "epoch_pct": 35.19, "eta": "43:53:31", "max_grad_norm": 0.8, "loss": 0.5495694875717163, "grad_norm": 1.0253753662109375, "learning_rate": 7.99651940840981e-05} +{"ts": "2025-12-27T17:59:00", "event": "train_log", "step": 5006, "epoch": 2.1122362869198312, "progress_pct": 35.2, "epoch_pct": 35.2, "eta": "43:52:16", "max_grad_norm": 0.8, "loss": 0.6199497580528259, "grad_norm": 1.5620673894882202, "learning_rate": 7.994596640583511e-05} +{"ts": "2025-12-27T17:59:13", "event": "train_log", "step": 5008, "epoch": 2.113080168776371, "progress_pct": 35.22, "epoch_pct": 35.22, "eta": "43:51:03", "max_grad_norm": 0.8, "loss": 0.5896390676498413, "grad_norm": 1.3032969236373901, "learning_rate": 7.992673181961281e-05} +{"ts": "2025-12-27T17:59:27", "event": "train_log", "step": 5010, "epoch": 2.1139240506329116, "progress_pct": 35.23, "epoch_pct": 35.23, "eta": "43:49:51", "max_grad_norm": 0.8, "loss": 0.6332341432571411, "grad_norm": 1.0933046340942383, "learning_rate": 7.990749032986821e-05} +{"ts": "2025-12-27T17:59:39", "event": "train_log", "step": 5012, "epoch": 2.1147679324894515, "progress_pct": 35.25, "epoch_pct": 35.25, "eta": "43:48:35", "max_grad_norm": 0.8, "loss": 0.5964323282241821, "grad_norm": 1.3115314245224, "learning_rate": 7.988824194104e-05} +{"ts": "2025-12-27T17:59:51", "event": "train_log", "step": 5014, "epoch": 2.1156118143459914, "progress_pct": 35.26, "epoch_pct": 35.26, "eta": "43:47:21", "max_grad_norm": 0.8, "loss": 0.5938325524330139, "grad_norm": 1.229978084564209, "learning_rate": 7.986898665756837e-05} +{"ts": "2025-12-27T18:00:04", "event": "train_log", "step": 5016, "epoch": 2.116455696202532, "progress_pct": 35.27, "epoch_pct": 35.27, "eta": "43:46:07", "max_grad_norm": 0.8, "loss": 0.5761791467666626, "grad_norm": 1.1779940128326416, "learning_rate": 7.984972448389517e-05} +{"ts": "2025-12-27T18:00:17", "event": "train_log", "step": 5018, "epoch": 2.1172995780590718, "progress_pct": 35.29, "epoch_pct": 35.29, "eta": "43:44:54", "max_grad_norm": 0.8, "loss": 0.6073653101921082, "grad_norm": 1.063490629196167, "learning_rate": 7.98304554244638e-05} +{"ts": "2025-12-27T18:00:29", "event": "train_log", "step": 5020, "epoch": 2.1181434599156117, "progress_pct": 35.3, "epoch_pct": 35.3, "eta": "43:43:40", "max_grad_norm": 0.8, "loss": 0.6126761436462402, "grad_norm": 1.2390391826629639, "learning_rate": 7.981117948371927e-05} +{"ts": "2025-12-27T18:00:42", "event": "train_log", "step": 5022, "epoch": 2.118987341772152, "progress_pct": 35.32, "epoch_pct": 35.32, "eta": "43:42:26", "max_grad_norm": 0.8, "loss": 0.614434003829956, "grad_norm": 1.1946247816085815, "learning_rate": 7.979189666610818e-05} +{"ts": "2025-12-27T18:00:54", "event": "train_log", "step": 5024, "epoch": 2.119831223628692, "progress_pct": 35.33, "epoch_pct": 35.33, "eta": "43:41:11", "max_grad_norm": 0.8, "loss": 0.5947603583335876, "grad_norm": 1.1008374691009521, "learning_rate": 7.977260697607867e-05} +{"ts": "2025-12-27T18:01:06", "event": "train_log", "step": 5026, "epoch": 2.120675105485232, "progress_pct": 35.34, "epoch_pct": 35.34, "eta": "43:39:58", "max_grad_norm": 0.8, "loss": 0.583965539932251, "grad_norm": 1.14899480342865, "learning_rate": 7.975331041808054e-05} +{"ts": "2025-12-27T18:01:19", "event": "train_log", "step": 5028, "epoch": 2.1215189873417724, "progress_pct": 35.36, "epoch_pct": 35.36, "eta": "43:38:44", "max_grad_norm": 0.8, "loss": 0.615121603012085, "grad_norm": 1.1627864837646484, "learning_rate": 7.973400699656512e-05} +{"ts": "2025-12-27T18:01:31", "event": "train_log", "step": 5030, "epoch": 2.1223628691983123, "progress_pct": 35.37, "epoch_pct": 35.37, "eta": "43:37:29", "max_grad_norm": 0.8, "loss": 0.6268601417541504, "grad_norm": 1.3622617721557617, "learning_rate": 7.971469671598532e-05} +{"ts": "2025-12-27T18:01:43", "event": "train_log", "step": 5032, "epoch": 2.1232067510548522, "progress_pct": 35.39, "epoch_pct": 35.39, "eta": "43:36:15", "max_grad_norm": 0.8, "loss": 0.6021270155906677, "grad_norm": 1.1735879182815552, "learning_rate": 7.96953795807957e-05} +{"ts": "2025-12-27T18:01:55", "event": "train_log", "step": 5034, "epoch": 2.124050632911392, "progress_pct": 35.4, "epoch_pct": 35.4, "eta": "43:35:00", "max_grad_norm": 0.8, "loss": 0.636816680431366, "grad_norm": 1.3856201171875, "learning_rate": 7.96760555954523e-05} +{"ts": "2025-12-27T18:02:09", "event": "train_log", "step": 5036, "epoch": 2.1248945147679326, "progress_pct": 35.41, "epoch_pct": 35.41, "eta": "43:33:48", "max_grad_norm": 0.8, "loss": 0.5324423313140869, "grad_norm": 1.1410126686096191, "learning_rate": 7.965672476441282e-05} +{"ts": "2025-12-27T18:02:20", "event": "train_log", "step": 5038, "epoch": 2.1257383966244725, "progress_pct": 35.43, "epoch_pct": 35.43, "eta": "43:32:33", "max_grad_norm": 0.8, "loss": 0.7433624267578125, "grad_norm": 1.446070909500122, "learning_rate": 7.963738709213651e-05} +{"ts": "2025-12-27T18:02:32", "event": "train_log", "step": 5040, "epoch": 2.1265822784810124, "progress_pct": 35.44, "epoch_pct": 35.44, "eta": "43:31:18", "max_grad_norm": 0.8, "loss": 0.6359145641326904, "grad_norm": 1.3041753768920898, "learning_rate": 7.961804258308419e-05} +{"ts": "2025-12-27T18:02:45", "event": "train_log", "step": 5042, "epoch": 2.127426160337553, "progress_pct": 35.46, "epoch_pct": 35.46, "eta": "43:30:05", "max_grad_norm": 0.8, "loss": 0.6164234280586243, "grad_norm": 1.2043813467025757, "learning_rate": 7.959869124171826e-05} +{"ts": "2025-12-27T18:02:57", "event": "train_log", "step": 5044, "epoch": 2.1282700421940928, "progress_pct": 35.47, "epoch_pct": 35.47, "eta": "43:28:51", "max_grad_norm": 0.8, "loss": 0.6437279582023621, "grad_norm": 1.2375630140304565, "learning_rate": 7.957933307250273e-05} +{"ts": "2025-12-27T18:03:10", "event": "train_log", "step": 5046, "epoch": 2.1291139240506327, "progress_pct": 35.49, "epoch_pct": 35.49, "eta": "43:27:38", "max_grad_norm": 0.8, "loss": 0.585924506187439, "grad_norm": 1.210644245147705, "learning_rate": 7.955996807990314e-05} +{"ts": "2025-12-27T18:03:22", "event": "train_log", "step": 5048, "epoch": 2.129957805907173, "progress_pct": 35.5, "epoch_pct": 35.5, "eta": "43:26:24", "max_grad_norm": 0.8, "loss": 0.6081803441047668, "grad_norm": 1.2011489868164062, "learning_rate": 7.954059626838661e-05} +{"ts": "2025-12-27T18:03:35", "event": "train_log", "step": 5050, "epoch": 2.130801687763713, "progress_pct": 35.51, "epoch_pct": 35.51, "eta": "43:25:11", "max_grad_norm": 0.8, "loss": 0.5609047412872314, "grad_norm": 1.0365782976150513, "learning_rate": 7.952121764242187e-05} +{"ts": "2025-12-27T18:03:48", "event": "train_log", "step": 5052, "epoch": 2.131645569620253, "progress_pct": 35.53, "epoch_pct": 35.53, "eta": "43:23:59", "max_grad_norm": 0.8, "loss": 0.5612874031066895, "grad_norm": 1.7950767278671265, "learning_rate": 7.950183220647918e-05} +{"ts": "2025-12-27T18:04:00", "event": "train_log", "step": 5054, "epoch": 2.1324894514767934, "progress_pct": 35.54, "epoch_pct": 35.54, "eta": "43:22:45", "max_grad_norm": 0.8, "loss": 0.6554630994796753, "grad_norm": 1.2933409214019775, "learning_rate": 7.94824399650304e-05} +{"ts": "2025-12-27T18:04:13", "event": "train_log", "step": 5056, "epoch": 2.1333333333333333, "progress_pct": 35.56, "epoch_pct": 35.56, "eta": "43:21:32", "max_grad_norm": 0.8, "loss": 0.5623239278793335, "grad_norm": 1.129828929901123, "learning_rate": 7.946304092254894e-05} +{"ts": "2025-12-27T18:04:25", "event": "train_log", "step": 5058, "epoch": 2.1341772151898732, "progress_pct": 35.57, "epoch_pct": 35.57, "eta": "43:20:19", "max_grad_norm": 0.8, "loss": 0.5036910772323608, "grad_norm": 1.1060296297073364, "learning_rate": 7.944363508350978e-05} +{"ts": "2025-12-27T18:04:37", "event": "train_log", "step": 5060, "epoch": 2.1350210970464136, "progress_pct": 35.58, "epoch_pct": 35.58, "eta": "43:19:05", "max_grad_norm": 0.8, "loss": 0.5840913653373718, "grad_norm": 1.2622627019882202, "learning_rate": 7.94242224523895e-05} +{"ts": "2025-12-27T18:04:48", "event": "train_log", "step": 5062, "epoch": 2.1358649789029536, "progress_pct": 35.6, "epoch_pct": 35.6, "eta": "43:17:49", "max_grad_norm": 0.8, "loss": 0.6365578770637512, "grad_norm": 1.3803153038024902, "learning_rate": 7.940480303366618e-05} +{"ts": "2025-12-27T18:05:09", "event": "train_log", "step": 5064, "epoch": 2.1367088607594935, "progress_pct": 35.61, "epoch_pct": 35.61, "eta": "43:16:52", "max_grad_norm": 0.8, "loss": 0.6167916655540466, "grad_norm": 1.2524651288986206, "learning_rate": 7.938537683181955e-05} +{"ts": "2025-12-27T18:05:35", "event": "train_log", "step": 5066, "epoch": 2.137552742616034, "progress_pct": 35.63, "epoch_pct": 35.63, "eta": "43:16:03", "max_grad_norm": 0.8, "loss": 0.6356930732727051, "grad_norm": 1.3320350646972656, "learning_rate": 7.936594385133083e-05} +{"ts": "2025-12-27T18:06:01", "event": "train_log", "step": 5068, "epoch": 2.138396624472574, "progress_pct": 35.64, "epoch_pct": 35.64, "eta": "43:15:14", "max_grad_norm": 0.8, "loss": 0.5888242721557617, "grad_norm": 1.3180949687957764, "learning_rate": 7.934650409668285e-05} +{"ts": "2025-12-27T18:06:20", "event": "train_log", "step": 5070, "epoch": 2.1392405063291138, "progress_pct": 35.65, "epoch_pct": 35.65, "eta": "43:14:14", "max_grad_norm": 0.8, "loss": 0.608725905418396, "grad_norm": 1.1376243829727173, "learning_rate": 7.932705757235999e-05} +{"ts": "2025-12-27T18:06:33", "event": "train_log", "step": 5072, "epoch": 2.140084388185654, "progress_pct": 35.67, "epoch_pct": 35.67, "eta": "43:13:01", "max_grad_norm": 0.8, "loss": 0.5824158787727356, "grad_norm": 1.1734369993209839, "learning_rate": 7.930760428284817e-05} +{"ts": "2025-12-27T18:06:46", "event": "train_log", "step": 5074, "epoch": 2.140928270042194, "progress_pct": 35.68, "epoch_pct": 35.68, "eta": "43:11:49", "max_grad_norm": 0.8, "loss": 0.5629416704177856, "grad_norm": 1.1038579940795898, "learning_rate": 7.928814423263493e-05} +{"ts": "2025-12-27T18:06:58", "event": "train_log", "step": 5076, "epoch": 2.141772151898734, "progress_pct": 35.7, "epoch_pct": 35.7, "eta": "43:10:36", "max_grad_norm": 0.8, "loss": 0.5994445085525513, "grad_norm": 1.269780158996582, "learning_rate": 7.926867742620929e-05} +{"ts": "2025-12-27T18:07:10", "event": "train_log", "step": 5078, "epoch": 2.1426160337552744, "progress_pct": 35.71, "epoch_pct": 35.71, "eta": "43:09:22", "max_grad_norm": 0.8, "loss": 0.5845475792884827, "grad_norm": 1.2274279594421387, "learning_rate": 7.924920386806188e-05} +{"ts": "2025-12-27T18:07:23", "event": "train_log", "step": 5080, "epoch": 2.1434599156118144, "progress_pct": 35.72, "epoch_pct": 35.72, "eta": "43:08:10", "max_grad_norm": 0.8, "loss": 0.621201753616333, "grad_norm": 1.168766975402832, "learning_rate": 7.922972356268488e-05} +{"ts": "2025-12-27T18:07:41", "event": "train_log", "step": 5082, "epoch": 2.1443037974683543, "progress_pct": 35.74, "epoch_pct": 35.74, "eta": "43:07:08", "max_grad_norm": 0.8, "loss": 0.5282597541809082, "grad_norm": 1.0057638883590698, "learning_rate": 7.921023651457203e-05} +{"ts": "2025-12-27T18:08:07", "event": "train_log", "step": 5084, "epoch": 2.1451476793248947, "progress_pct": 35.75, "epoch_pct": 35.75, "eta": "43:06:20", "max_grad_norm": 0.8, "loss": 0.632583737373352, "grad_norm": 1.432309865951538, "learning_rate": 7.91907427282186e-05} +{"ts": "2025-12-27T18:08:35", "event": "train_log", "step": 5086, "epoch": 2.1459915611814346, "progress_pct": 35.77, "epoch_pct": 35.77, "eta": "43:05:35", "max_grad_norm": 0.8, "loss": 0.6239289045333862, "grad_norm": 1.3939776420593262, "learning_rate": 7.917124220812144e-05} +{"ts": "2025-12-27T18:09:05", "event": "train_log", "step": 5088, "epoch": 2.1468354430379746, "progress_pct": 35.78, "epoch_pct": 35.78, "eta": "43:04:53", "max_grad_norm": 0.8, "loss": 0.5749062895774841, "grad_norm": 1.3741775751113892, "learning_rate": 7.915173495877895e-05} +{"ts": "2025-12-27T18:09:26", "event": "train_log", "step": 5090, "epoch": 2.147679324894515, "progress_pct": 35.79, "epoch_pct": 35.79, "eta": "43:03:56", "max_grad_norm": 0.8, "loss": 0.6011738181114197, "grad_norm": 1.3123528957366943, "learning_rate": 7.913222098469109e-05} +{"ts": "2025-12-27T18:09:38", "event": "train_log", "step": 5092, "epoch": 2.148523206751055, "progress_pct": 35.81, "epoch_pct": 35.81, "eta": "43:02:43", "max_grad_norm": 0.8, "loss": 0.5804699659347534, "grad_norm": 1.3473498821258545, "learning_rate": 7.911270029035932e-05} +{"ts": "2025-12-27T18:09:50", "event": "train_log", "step": 5094, "epoch": 2.149367088607595, "progress_pct": 35.82, "epoch_pct": 35.82, "eta": "43:01:31", "max_grad_norm": 0.8, "loss": 0.6446103453636169, "grad_norm": 1.0873067378997803, "learning_rate": 7.909317288028673e-05} +{"ts": "2025-12-27T18:10:03", "event": "train_log", "step": 5096, "epoch": 2.1502109704641352, "progress_pct": 35.84, "epoch_pct": 35.84, "eta": "43:00:19", "max_grad_norm": 0.8, "loss": 0.6136524677276611, "grad_norm": 1.1374083757400513, "learning_rate": 7.907363875897789e-05} +{"ts": "2025-12-27T18:10:20", "event": "train_log", "step": 5098, "epoch": 2.151054852320675, "progress_pct": 35.85, "epoch_pct": 35.85, "eta": "42:59:15", "max_grad_norm": 0.8, "loss": 0.5107976794242859, "grad_norm": 1.1356533765792847, "learning_rate": 7.905409793093896e-05} +{"ts": "2025-12-27T18:10:49", "event": "train_log", "step": 5100, "epoch": 2.151898734177215, "progress_pct": 35.86, "epoch_pct": 35.86, "eta": "42:58:31", "max_grad_norm": 0.8, "loss": 0.6073099374771118, "grad_norm": 1.2579567432403564, "learning_rate": 7.903455040067763e-05} +{"ts": "2025-12-27T18:23:03", "event": "train_log", "step": 5100, "epoch": 2.151898734177215, "progress_pct": 35.86, "epoch_pct": 35.86, "eta": "43:20:23", "max_grad_norm": 0.8, "eval_loss": 0.6902023553848267, "eval_runtime": 733.915, "eval_samples_per_second": 2.871, "eval_steps_per_second": 2.871} +{"ts": "2025-12-27T18:23:15", "event": "train_log", "step": 5102, "epoch": 2.1527426160337555, "progress_pct": 35.88, "epoch_pct": 35.88, "eta": "43:19:10", "max_grad_norm": 0.8, "loss": 0.5562406182289124, "grad_norm": 1.2401398420333862, "learning_rate": 7.901499617270315e-05} +{"ts": "2025-12-27T18:23:38", "event": "train_log", "step": 5104, "epoch": 2.1535864978902954, "progress_pct": 35.89, "epoch_pct": 35.89, "eta": "43:18:16", "max_grad_norm": 0.8, "loss": 0.5749467015266418, "grad_norm": 1.086590051651001, "learning_rate": 7.899543525152628e-05} +{"ts": "2025-12-27T18:23:51", "event": "train_log", "step": 5106, "epoch": 2.1544303797468354, "progress_pct": 35.91, "epoch_pct": 35.91, "eta": "43:17:04", "max_grad_norm": 0.8, "loss": 0.6326877474784851, "grad_norm": 1.206458568572998, "learning_rate": 7.897586764165939e-05} +{"ts": "2025-12-27T18:24:04", "event": "train_log", "step": 5108, "epoch": 2.1552742616033758, "progress_pct": 35.92, "epoch_pct": 35.92, "eta": "43:15:51", "max_grad_norm": 0.8, "loss": 0.5616445541381836, "grad_norm": 1.030740737915039, "learning_rate": 7.895629334761632e-05} +{"ts": "2025-12-27T18:24:16", "event": "train_log", "step": 5110, "epoch": 2.1561181434599157, "progress_pct": 35.94, "epoch_pct": 35.94, "eta": "43:14:39", "max_grad_norm": 0.8, "loss": 0.6307384371757507, "grad_norm": 1.3338581323623657, "learning_rate": 7.89367123739125e-05} +{"ts": "2025-12-27T18:24:28", "event": "train_log", "step": 5112, "epoch": 2.1569620253164556, "progress_pct": 35.95, "epoch_pct": 35.95, "eta": "43:13:25", "max_grad_norm": 0.8, "loss": 0.6087653636932373, "grad_norm": 1.2684671878814697, "learning_rate": 7.891712472506485e-05} +{"ts": "2025-12-27T18:24:41", "event": "train_log", "step": 5114, "epoch": 2.1578059071729956, "progress_pct": 35.96, "epoch_pct": 35.96, "eta": "43:12:12", "max_grad_norm": 0.8, "loss": 0.5747998952865601, "grad_norm": 1.1610581874847412, "learning_rate": 7.889753040559188e-05} +{"ts": "2025-12-27T18:24:54", "event": "train_log", "step": 5116, "epoch": 2.158649789029536, "progress_pct": 35.98, "epoch_pct": 35.98, "eta": "43:11:01", "max_grad_norm": 0.8, "loss": 0.6143770217895508, "grad_norm": 1.4069275856018066, "learning_rate": 7.887792942001366e-05} +{"ts": "2025-12-27T18:25:07", "event": "train_log", "step": 5118, "epoch": 2.159493670886076, "progress_pct": 35.99, "epoch_pct": 35.99, "eta": "43:09:48", "max_grad_norm": 0.8, "loss": 0.552534282207489, "grad_norm": 1.0858227014541626, "learning_rate": 7.885832177285173e-05} +{"ts": "2025-12-27T18:25:20", "event": "train_log", "step": 5120, "epoch": 2.160337552742616, "progress_pct": 36.01, "epoch_pct": 36.01, "eta": "43:08:37", "max_grad_norm": 0.8, "loss": 0.5781989693641663, "grad_norm": 1.067070722579956, "learning_rate": 7.88387074686292e-05} +{"ts": "2025-12-27T18:25:33", "event": "train_log", "step": 5122, "epoch": 2.1611814345991562, "progress_pct": 36.02, "epoch_pct": 36.02, "eta": "43:07:25", "max_grad_norm": 0.8, "loss": 0.5521422624588013, "grad_norm": 1.139981746673584, "learning_rate": 7.881908651187072e-05} +{"ts": "2025-12-27T18:25:47", "event": "train_log", "step": 5124, "epoch": 2.162025316455696, "progress_pct": 36.03, "epoch_pct": 36.03, "eta": "43:06:16", "max_grad_norm": 0.8, "loss": 0.5755025744438171, "grad_norm": 1.0987457036972046, "learning_rate": 7.879945890710245e-05} +{"ts": "2025-12-27T18:26:00", "event": "train_log", "step": 5126, "epoch": 2.162869198312236, "progress_pct": 36.05, "epoch_pct": 36.05, "eta": "43:05:04", "max_grad_norm": 0.8, "loss": 0.5783509612083435, "grad_norm": 1.1530758142471313, "learning_rate": 7.877982465885214e-05} +{"ts": "2025-12-27T18:26:24", "event": "train_log", "step": 5128, "epoch": 2.1637130801687765, "progress_pct": 36.06, "epoch_pct": 36.06, "eta": "43:04:11", "max_grad_norm": 0.8, "loss": 0.5942281484603882, "grad_norm": 1.2285696268081665, "learning_rate": 7.876018377164899e-05} +{"ts": "2025-12-27T18:26:37", "event": "train_log", "step": 5130, "epoch": 2.1645569620253164, "progress_pct": 36.08, "epoch_pct": 36.08, "eta": "43:03:00", "max_grad_norm": 0.8, "loss": 0.5539707541465759, "grad_norm": 1.1283711194992065, "learning_rate": 7.874053625002378e-05} +{"ts": "2025-12-27T18:26:56", "event": "train_log", "step": 5132, "epoch": 2.1654008438818564, "progress_pct": 36.09, "epoch_pct": 36.09, "eta": "43:02:00", "max_grad_norm": 0.8, "loss": 0.5955292582511902, "grad_norm": 1.3213335275650024, "learning_rate": 7.872088209850885e-05} +{"ts": "2025-12-27T18:27:26", "event": "train_log", "step": 5134, "epoch": 2.1662447257383968, "progress_pct": 36.1, "epoch_pct": 36.1, "eta": "43:01:18", "max_grad_norm": 0.8, "loss": 0.5422899723052979, "grad_norm": 1.1748592853546143, "learning_rate": 7.8701221321638e-05} +{"ts": "2025-12-27T18:27:54", "event": "train_log", "step": 5136, "epoch": 2.1670886075949367, "progress_pct": 36.12, "epoch_pct": 36.12, "eta": "43:00:34", "max_grad_norm": 0.8, "loss": 0.5547205209732056, "grad_norm": 1.0752148628234863, "learning_rate": 7.868155392394662e-05} +{"ts": "2025-12-27T18:28:19", "event": "train_log", "step": 5138, "epoch": 2.1679324894514767, "progress_pct": 36.13, "epoch_pct": 36.13, "eta": "42:59:44", "max_grad_norm": 0.8, "loss": 0.5938948392868042, "grad_norm": 1.1814554929733276, "learning_rate": 7.86618799099716e-05} +{"ts": "2025-12-27T18:28:32", "event": "train_log", "step": 5140, "epoch": 2.168776371308017, "progress_pct": 36.15, "epoch_pct": 36.15, "eta": "42:58:33", "max_grad_norm": 0.8, "loss": 0.6468925476074219, "grad_norm": 1.3455278873443604, "learning_rate": 7.864219928425132e-05} +{"ts": "2025-12-27T18:28:45", "event": "train_log", "step": 5142, "epoch": 2.169620253164557, "progress_pct": 36.16, "epoch_pct": 36.16, "eta": "42:57:21", "max_grad_norm": 0.8, "loss": 0.5704391002655029, "grad_norm": 1.2695354223251343, "learning_rate": 7.862251205132576e-05} +{"ts": "2025-12-27T18:28:57", "event": "train_log", "step": 5144, "epoch": 2.170464135021097, "progress_pct": 36.17, "epoch_pct": 36.17, "eta": "42:56:08", "max_grad_norm": 0.8, "loss": 0.6057283878326416, "grad_norm": 1.1529468297958374, "learning_rate": 7.860281821573638e-05} +{"ts": "2025-12-27T18:29:10", "event": "train_log", "step": 5146, "epoch": 2.1713080168776373, "progress_pct": 36.19, "epoch_pct": 36.19, "eta": "42:54:56", "max_grad_norm": 0.8, "loss": 0.6135527491569519, "grad_norm": 1.3461004495620728, "learning_rate": 7.858311778202616e-05} +{"ts": "2025-12-27T18:29:22", "event": "train_log", "step": 5148, "epoch": 2.1721518987341772, "progress_pct": 36.2, "epoch_pct": 36.2, "eta": "42:53:44", "max_grad_norm": 0.8, "loss": 0.5585638880729675, "grad_norm": 1.1258536577224731, "learning_rate": 7.856341075473962e-05} +{"ts": "2025-12-27T18:29:35", "event": "train_log", "step": 5150, "epoch": 2.172995780590717, "progress_pct": 36.22, "epoch_pct": 36.22, "eta": "42:52:33", "max_grad_norm": 0.8, "loss": 0.5780918002128601, "grad_norm": 1.254898190498352, "learning_rate": 7.854369713842279e-05} +{"ts": "2025-12-27T18:29:47", "event": "train_log", "step": 5152, "epoch": 2.1738396624472576, "progress_pct": 36.23, "epoch_pct": 36.23, "eta": "42:51:20", "max_grad_norm": 0.8, "loss": 0.595267117023468, "grad_norm": 1.2730201482772827, "learning_rate": 7.852397693762321e-05} +{"ts": "2025-12-27T18:30:05", "event": "train_log", "step": 5154, "epoch": 2.1746835443037975, "progress_pct": 36.24, "epoch_pct": 36.24, "eta": "42:50:17", "max_grad_norm": 0.8, "loss": 0.5636162161827087, "grad_norm": 1.1875078678131104, "learning_rate": 7.850425015688999e-05} +{"ts": "2025-12-27T18:30:22", "event": "train_log", "step": 5156, "epoch": 2.1755274261603375, "progress_pct": 36.26, "epoch_pct": 36.26, "eta": "42:49:13", "max_grad_norm": 0.8, "loss": 0.6362089514732361, "grad_norm": 1.0930945873260498, "learning_rate": 7.848451680077366e-05} +{"ts": "2025-12-27T18:30:35", "event": "train_log", "step": 5158, "epoch": 2.176371308016878, "progress_pct": 36.27, "epoch_pct": 36.27, "eta": "42:48:03", "max_grad_norm": 0.8, "loss": 0.6268675327301025, "grad_norm": 1.2274452447891235, "learning_rate": 7.846477687382639e-05} +{"ts": "2025-12-27T18:30:47", "event": "train_log", "step": 5160, "epoch": 2.1772151898734178, "progress_pct": 36.29, "epoch_pct": 36.29, "eta": "42:46:50", "max_grad_norm": 0.8, "loss": 0.6014906167984009, "grad_norm": 1.2023133039474487, "learning_rate": 7.844503038060176e-05} +{"ts": "2025-12-27T18:30:59", "event": "train_log", "step": 5162, "epoch": 2.1780590717299577, "progress_pct": 36.3, "epoch_pct": 36.3, "eta": "42:45:38", "max_grad_norm": 0.8, "loss": 0.6180019974708557, "grad_norm": 1.2616889476776123, "learning_rate": 7.842527732565491e-05} +{"ts": "2025-12-27T18:31:12", "event": "train_log", "step": 5164, "epoch": 2.1789029535864977, "progress_pct": 36.32, "epoch_pct": 36.32, "eta": "42:44:26", "max_grad_norm": 0.8, "loss": 0.5400100946426392, "grad_norm": 1.1046907901763916, "learning_rate": 7.84055177135425e-05} +{"ts": "2025-12-27T18:31:25", "event": "train_log", "step": 5166, "epoch": 2.179746835443038, "progress_pct": 36.33, "epoch_pct": 36.33, "eta": "42:43:16", "max_grad_norm": 0.8, "loss": 0.5713199973106384, "grad_norm": 1.1664032936096191, "learning_rate": 7.83857515488227e-05} +{"ts": "2025-12-27T18:31:38", "event": "train_log", "step": 5168, "epoch": 2.180590717299578, "progress_pct": 36.34, "epoch_pct": 36.34, "eta": "42:42:05", "max_grad_norm": 0.8, "loss": 0.5741307735443115, "grad_norm": 1.2526558637619019, "learning_rate": 7.836597883605519e-05} +{"ts": "2025-12-27T18:31:52", "event": "train_log", "step": 5170, "epoch": 2.181434599156118, "progress_pct": 36.36, "epoch_pct": 36.36, "eta": "42:40:55", "max_grad_norm": 0.8, "loss": 0.47188031673431396, "grad_norm": 1.0457103252410889, "learning_rate": 7.834619957980112e-05} +{"ts": "2025-12-27T18:32:07", "event": "train_log", "step": 5172, "epoch": 2.1822784810126583, "progress_pct": 36.37, "epoch_pct": 36.37, "eta": "42:39:49", "max_grad_norm": 0.8, "loss": 0.6149471998214722, "grad_norm": 1.1978110074996948, "learning_rate": 7.832641378462319e-05} +{"ts": "2025-12-27T18:32:37", "event": "train_log", "step": 5174, "epoch": 2.1831223628691983, "progress_pct": 36.39, "epoch_pct": 36.39, "eta": "42:39:08", "max_grad_norm": 0.8, "loss": 0.5520018339157104, "grad_norm": 1.2231460809707642, "learning_rate": 7.830662145508567e-05} +{"ts": "2025-12-27T18:33:09", "event": "train_log", "step": 5176, "epoch": 2.183966244725738, "progress_pct": 36.4, "epoch_pct": 36.4, "eta": "42:38:30", "max_grad_norm": 0.8, "loss": 0.6536548733711243, "grad_norm": 1.4367618560791016, "learning_rate": 7.828682259575417e-05} +{"ts": "2025-12-27T18:33:41", "event": "train_log", "step": 5178, "epoch": 2.1848101265822786, "progress_pct": 36.41, "epoch_pct": 36.41, "eta": "42:37:53", "max_grad_norm": 0.8, "loss": 0.5324372053146362, "grad_norm": 1.0891374349594116, "learning_rate": 7.826701721119598e-05} +{"ts": "2025-12-27T18:34:14", "event": "train_log", "step": 5180, "epoch": 2.1856540084388185, "progress_pct": 36.43, "epoch_pct": 36.43, "eta": "42:37:18", "max_grad_norm": 0.8, "loss": 0.6127952337265015, "grad_norm": 1.118695616722107, "learning_rate": 7.82472053059798e-05} +{"ts": "2025-12-27T18:34:39", "event": "train_log", "step": 5182, "epoch": 2.1864978902953585, "progress_pct": 36.44, "epoch_pct": 36.44, "eta": "42:36:28", "max_grad_norm": 0.8, "loss": 0.505962610244751, "grad_norm": 1.1116070747375488, "learning_rate": 7.822738688467585e-05} +{"ts": "2025-12-27T18:34:51", "event": "train_log", "step": 5184, "epoch": 2.187341772151899, "progress_pct": 36.46, "epoch_pct": 36.46, "eta": "42:35:17", "max_grad_norm": 0.8, "loss": 0.6210073232650757, "grad_norm": 1.2140545845031738, "learning_rate": 7.820756195185586e-05} +{"ts": "2025-12-27T18:35:03", "event": "train_log", "step": 5186, "epoch": 2.188185654008439, "progress_pct": 36.47, "epoch_pct": 36.47, "eta": "42:34:05", "max_grad_norm": 0.8, "loss": 0.6517674326896667, "grad_norm": 1.2135601043701172, "learning_rate": 7.818773051209307e-05} +{"ts": "2025-12-27T18:35:15", "event": "train_log", "step": 5188, "epoch": 2.1890295358649787, "progress_pct": 36.48, "epoch_pct": 36.48, "eta": "42:32:53", "max_grad_norm": 0.8, "loss": 0.5577492117881775, "grad_norm": 1.3875514268875122, "learning_rate": 7.816789256996218e-05} +{"ts": "2025-12-27T18:35:29", "event": "train_log", "step": 5190, "epoch": 2.189873417721519, "progress_pct": 36.5, "epoch_pct": 36.5, "eta": "42:31:43", "max_grad_norm": 0.8, "loss": 0.6010199189186096, "grad_norm": 1.181325912475586, "learning_rate": 7.814804813003949e-05} +{"ts": "2025-12-27T18:35:41", "event": "train_log", "step": 5192, "epoch": 2.190717299578059, "progress_pct": 36.51, "epoch_pct": 36.51, "eta": "42:30:32", "max_grad_norm": 0.8, "loss": 0.5635302662849426, "grad_norm": 1.102044701576233, "learning_rate": 7.812819719690265e-05} +{"ts": "2025-12-27T18:35:54", "event": "train_log", "step": 5194, "epoch": 2.191561181434599, "progress_pct": 36.53, "epoch_pct": 36.53, "eta": "42:29:21", "max_grad_norm": 0.8, "loss": 0.5804321765899658, "grad_norm": 1.4227958917617798, "learning_rate": 7.810833977513094e-05} +{"ts": "2025-12-27T18:36:06", "event": "train_log", "step": 5196, "epoch": 2.1924050632911394, "progress_pct": 36.54, "epoch_pct": 36.54, "eta": "42:28:10", "max_grad_norm": 0.8, "loss": 0.6005555987358093, "grad_norm": 1.2573446035385132, "learning_rate": 7.80884758693051e-05} +{"ts": "2025-12-27T18:36:19", "event": "train_log", "step": 5198, "epoch": 2.1932489451476793, "progress_pct": 36.55, "epoch_pct": 36.55, "eta": "42:26:58", "max_grad_norm": 0.8, "loss": 0.6263643503189087, "grad_norm": 1.3534085750579834, "learning_rate": 7.80686054840073e-05} +{"ts": "2025-12-27T18:36:31", "event": "train_log", "step": 5200, "epoch": 2.1940928270042193, "progress_pct": 36.57, "epoch_pct": 36.57, "eta": "42:25:47", "max_grad_norm": 0.8, "loss": 0.6235764622688293, "grad_norm": 1.6895852088928223, "learning_rate": 7.804872862382131e-05} +{"ts": "2025-12-27T18:55:59", "event": "train_log", "step": 5200, "epoch": 2.1940928270042193, "progress_pct": 36.57, "epoch_pct": 36.57, "eta": "42:59:33", "max_grad_norm": 0.8, "eval_loss": 0.6915348172187805, "eval_runtime": 1167.9782, "eval_samples_per_second": 1.804, "eval_steps_per_second": 1.804} +{"ts": "2025-12-27T18:56:30", "event": "train_log", "step": 5202, "epoch": 2.1949367088607596, "progress_pct": 36.58, "epoch_pct": 36.58, "eta": "42:58:53", "max_grad_norm": 0.8, "loss": 0.5586035847663879, "grad_norm": 1.138973593711853, "learning_rate": 7.802884529333227e-05} +{"ts": "2025-12-27T18:56:59", "event": "train_log", "step": 5204, "epoch": 2.1957805907172996, "progress_pct": 36.6, "epoch_pct": 36.6, "eta": "42:58:09", "max_grad_norm": 0.8, "loss": 0.5768917202949524, "grad_norm": 1.3664026260375977, "learning_rate": 7.800895549712697e-05} +{"ts": "2025-12-27T18:57:26", "event": "train_log", "step": 5206, "epoch": 2.1966244725738395, "progress_pct": 36.61, "epoch_pct": 36.61, "eta": "42:57:23", "max_grad_norm": 0.8, "loss": 0.6046215891838074, "grad_norm": 1.2182449102401733, "learning_rate": 7.798905923979353e-05} +{"ts": "2025-12-27T18:57:38", "event": "train_log", "step": 5208, "epoch": 2.19746835443038, "progress_pct": 36.62, "epoch_pct": 36.62, "eta": "42:56:10", "max_grad_norm": 0.8, "loss": 0.5412904024124146, "grad_norm": 1.2692211866378784, "learning_rate": 7.796915652592167e-05} +{"ts": "2025-12-27T18:57:58", "event": "train_log", "step": 5210, "epoch": 2.19831223628692, "progress_pct": 36.64, "epoch_pct": 36.64, "eta": "42:55:10", "max_grad_norm": 0.8, "loss": 0.5328584909439087, "grad_norm": 1.200822114944458, "learning_rate": 7.794924736010256e-05} +{"ts": "2025-12-27T18:58:29", "event": "train_log", "step": 5212, "epoch": 2.19915611814346, "progress_pct": 36.65, "epoch_pct": 36.65, "eta": "42:54:31", "max_grad_norm": 0.8, "loss": 0.5497913360595703, "grad_norm": 1.1093779802322388, "learning_rate": 7.792933174692886e-05} +{"ts": "2025-12-27T18:58:58", "event": "train_log", "step": 5214, "epoch": 2.2, "progress_pct": 36.67, "epoch_pct": 36.67, "eta": "42:53:48", "max_grad_norm": 0.8, "loss": 0.5908066034317017, "grad_norm": 1.3838921785354614, "learning_rate": 7.790940969099471e-05} +{"ts": "2025-12-27T18:59:27", "event": "train_log", "step": 5216, "epoch": 2.20084388185654, "progress_pct": 36.68, "epoch_pct": 36.68, "eta": "42:53:03", "max_grad_norm": 0.8, "loss": 0.6117307543754578, "grad_norm": 1.1411913633346558, "learning_rate": 7.788948119689576e-05} +{"ts": "2025-12-27T18:59:56", "event": "train_log", "step": 5218, "epoch": 2.20168776371308, "progress_pct": 36.69, "epoch_pct": 36.69, "eta": "42:52:20", "max_grad_norm": 0.8, "loss": 0.5788605809211731, "grad_norm": 1.5668916702270508, "learning_rate": 7.786954626922913e-05} +{"ts": "2025-12-27T19:00:24", "event": "train_log", "step": 5220, "epoch": 2.2025316455696204, "progress_pct": 36.71, "epoch_pct": 36.71, "eta": "42:51:35", "max_grad_norm": 0.8, "loss": 0.5948591828346252, "grad_norm": 1.195027232170105, "learning_rate": 7.784960491259344e-05} +{"ts": "2025-12-27T19:00:53", "event": "train_log", "step": 5222, "epoch": 2.2033755274261604, "progress_pct": 36.72, "epoch_pct": 36.72, "eta": "42:50:52", "max_grad_norm": 0.8, "loss": 0.6321669220924377, "grad_norm": 1.2665271759033203, "learning_rate": 7.782965713158872e-05} +{"ts": "2025-12-27T19:01:23", "event": "train_log", "step": 5224, "epoch": 2.2042194092827003, "progress_pct": 36.74, "epoch_pct": 36.74, "eta": "42:50:11", "max_grad_norm": 0.8, "loss": 0.5853859186172485, "grad_norm": 1.123711109161377, "learning_rate": 7.78097029308166e-05} +{"ts": "2025-12-27T19:01:53", "event": "train_log", "step": 5226, "epoch": 2.2050632911392407, "progress_pct": 36.75, "epoch_pct": 36.75, "eta": "42:49:29", "max_grad_norm": 0.8, "loss": 0.6485977172851562, "grad_norm": 1.9381071329116821, "learning_rate": 7.77897423148801e-05} +{"ts": "2025-12-27T19:02:23", "event": "train_log", "step": 5228, "epoch": 2.2059071729957807, "progress_pct": 36.77, "epoch_pct": 36.77, "eta": "42:48:46", "max_grad_norm": 0.8, "loss": 0.6243517398834229, "grad_norm": 1.4062265157699585, "learning_rate": 7.776977528838376e-05} +{"ts": "2025-12-27T19:02:52", "event": "train_log", "step": 5230, "epoch": 2.2067510548523206, "progress_pct": 36.78, "epoch_pct": 36.78, "eta": "42:48:03", "max_grad_norm": 0.8, "loss": 0.5770578980445862, "grad_norm": 1.2127182483673096, "learning_rate": 7.774980185593358e-05} +{"ts": "2025-12-27T19:03:19", "event": "train_log", "step": 5232, "epoch": 2.207594936708861, "progress_pct": 36.79, "epoch_pct": 36.79, "eta": "42:47:17", "max_grad_norm": 0.8, "loss": 0.6521194577217102, "grad_norm": 1.250847578048706, "learning_rate": 7.772982202213709e-05} +{"ts": "2025-12-27T19:03:50", "event": "train_log", "step": 5234, "epoch": 2.208438818565401, "progress_pct": 36.81, "epoch_pct": 36.81, "eta": "42:46:36", "max_grad_norm": 0.8, "loss": 0.5755271911621094, "grad_norm": 1.2568131685256958, "learning_rate": 7.77098357916032e-05} +{"ts": "2025-12-27T19:04:20", "event": "train_log", "step": 5236, "epoch": 2.209282700421941, "progress_pct": 36.82, "epoch_pct": 36.82, "eta": "42:45:55", "max_grad_norm": 0.8, "loss": 0.5486469864845276, "grad_norm": 1.2422975301742554, "learning_rate": 7.768984316894236e-05} +{"ts": "2025-12-27T19:04:51", "event": "train_log", "step": 5238, "epoch": 2.2101265822784812, "progress_pct": 36.84, "epoch_pct": 36.84, "eta": "42:45:15", "max_grad_norm": 0.8, "loss": 0.5512928366661072, "grad_norm": 1.1018635034561157, "learning_rate": 7.766984415876652e-05} +{"ts": "2025-12-27T19:05:20", "event": "train_log", "step": 5240, "epoch": 2.210970464135021, "progress_pct": 36.85, "epoch_pct": 36.85, "eta": "42:44:32", "max_grad_norm": 0.8, "loss": 0.5753499269485474, "grad_norm": 1.2261123657226562, "learning_rate": 7.764983876568903e-05} +{"ts": "2025-12-27T19:05:50", "event": "train_log", "step": 5242, "epoch": 2.211814345991561, "progress_pct": 36.86, "epoch_pct": 36.86, "eta": "42:43:51", "max_grad_norm": 0.8, "loss": 0.5404848456382751, "grad_norm": 1.2222342491149902, "learning_rate": 7.762982699432474e-05} +{"ts": "2025-12-27T19:06:21", "event": "train_log", "step": 5244, "epoch": 2.212658227848101, "progress_pct": 36.88, "epoch_pct": 36.88, "eta": "42:43:10", "max_grad_norm": 0.8, "loss": 0.5999218821525574, "grad_norm": 1.231494426727295, "learning_rate": 7.760980884929004e-05} +{"ts": "2025-12-27T19:06:49", "event": "train_log", "step": 5246, "epoch": 2.2135021097046415, "progress_pct": 36.89, "epoch_pct": 36.89, "eta": "42:42:26", "max_grad_norm": 0.8, "loss": 0.6123101115226746, "grad_norm": 1.1530078649520874, "learning_rate": 7.758978433520268e-05} +{"ts": "2025-12-27T19:07:18", "event": "train_log", "step": 5248, "epoch": 2.2143459915611814, "progress_pct": 36.91, "epoch_pct": 36.91, "eta": "42:41:43", "max_grad_norm": 0.8, "loss": 0.5945886969566345, "grad_norm": 1.182706594467163, "learning_rate": 7.756975345668194e-05} +{"ts": "2025-12-27T19:07:49", "event": "train_log", "step": 5250, "epoch": 2.2151898734177213, "progress_pct": 36.92, "epoch_pct": 36.92, "eta": "42:41:02", "max_grad_norm": 0.8, "loss": 0.5698213577270508, "grad_norm": 1.0788652896881104, "learning_rate": 7.754971621834857e-05} +{"ts": "2025-12-27T19:08:18", "event": "train_log", "step": 5252, "epoch": 2.2160337552742617, "progress_pct": 36.93, "epoch_pct": 36.93, "eta": "42:40:19", "max_grad_norm": 0.8, "loss": 0.5959678888320923, "grad_norm": 1.2243359088897705, "learning_rate": 7.752967262482477e-05} +{"ts": "2025-12-27T19:08:47", "event": "train_log", "step": 5254, "epoch": 2.2168776371308017, "progress_pct": 36.95, "epoch_pct": 36.95, "eta": "42:39:36", "max_grad_norm": 0.8, "loss": 0.586794376373291, "grad_norm": 1.4292869567871094, "learning_rate": 7.750962268073421e-05} +{"ts": "2025-12-27T19:09:16", "event": "train_log", "step": 5256, "epoch": 2.2177215189873416, "progress_pct": 36.96, "epoch_pct": 36.96, "eta": "42:38:53", "max_grad_norm": 0.8, "loss": 0.5513298511505127, "grad_norm": 1.1809570789337158, "learning_rate": 7.748956639070204e-05} +{"ts": "2025-12-27T19:09:44", "event": "train_log", "step": 5258, "epoch": 2.218565400843882, "progress_pct": 36.98, "epoch_pct": 36.98, "eta": "42:38:07", "max_grad_norm": 0.8, "loss": 0.6402831673622131, "grad_norm": 1.485813856124878, "learning_rate": 7.746950375935484e-05} +{"ts": "2025-12-27T19:10:14", "event": "train_log", "step": 5260, "epoch": 2.219409282700422, "progress_pct": 36.99, "epoch_pct": 36.99, "eta": "42:37:26", "max_grad_norm": 0.8, "loss": 0.5729117393493652, "grad_norm": 1.0851374864578247, "learning_rate": 7.744943479132069e-05} +{"ts": "2025-12-27T19:10:42", "event": "train_log", "step": 5262, "epoch": 2.220253164556962, "progress_pct": 37.0, "epoch_pct": 37.0, "eta": "42:36:41", "max_grad_norm": 0.8, "loss": 0.6239725947380066, "grad_norm": 1.4308949708938599, "learning_rate": 7.742935949122911e-05} +{"ts": "2025-12-27T19:11:10", "event": "train_log", "step": 5264, "epoch": 2.2210970464135023, "progress_pct": 37.02, "epoch_pct": 37.02, "eta": "42:35:57", "max_grad_norm": 0.8, "loss": 0.6260181069374084, "grad_norm": 1.379258155822754, "learning_rate": 7.740927786371107e-05} +{"ts": "2025-12-27T19:11:40", "event": "train_log", "step": 5266, "epoch": 2.221940928270042, "progress_pct": 37.03, "epoch_pct": 37.03, "eta": "42:35:16", "max_grad_norm": 0.8, "loss": 0.6074157357215881, "grad_norm": 1.1661925315856934, "learning_rate": 7.738918991339905e-05} +{"ts": "2025-12-27T19:12:09", "event": "train_log", "step": 5268, "epoch": 2.222784810126582, "progress_pct": 37.05, "epoch_pct": 37.05, "eta": "42:34:32", "max_grad_norm": 0.8, "loss": 0.6119515895843506, "grad_norm": 1.168901801109314, "learning_rate": 7.736909564492694e-05} +{"ts": "2025-12-27T19:12:39", "event": "train_log", "step": 5270, "epoch": 2.2236286919831225, "progress_pct": 37.06, "epoch_pct": 37.06, "eta": "42:33:51", "max_grad_norm": 0.8, "loss": 0.5505842566490173, "grad_norm": 1.1451057195663452, "learning_rate": 7.734899506293008e-05} +{"ts": "2025-12-27T19:13:08", "event": "train_log", "step": 5272, "epoch": 2.2244725738396625, "progress_pct": 37.07, "epoch_pct": 37.07, "eta": "42:33:08", "max_grad_norm": 0.8, "loss": 0.6117991805076599, "grad_norm": 1.2303991317749023, "learning_rate": 7.732888817204533e-05} +{"ts": "2025-12-27T19:13:39", "event": "train_log", "step": 5274, "epoch": 2.2253164556962024, "progress_pct": 37.09, "epoch_pct": 37.09, "eta": "42:32:27", "max_grad_norm": 0.8, "loss": 0.5589770078659058, "grad_norm": 1.04572331905365, "learning_rate": 7.730877497691092e-05} +{"ts": "2025-12-27T19:14:08", "event": "train_log", "step": 5276, "epoch": 2.226160337552743, "progress_pct": 37.1, "epoch_pct": 37.1, "eta": "42:31:44", "max_grad_norm": 0.8, "loss": 0.6288654208183289, "grad_norm": 1.2047234773635864, "learning_rate": 7.72886554821666e-05} +{"ts": "2025-12-27T19:14:36", "event": "train_log", "step": 5278, "epoch": 2.2270042194092827, "progress_pct": 37.12, "epoch_pct": 37.12, "eta": "42:31:00", "max_grad_norm": 0.8, "loss": 0.6174501776695251, "grad_norm": 1.2036652565002441, "learning_rate": 7.726852969245355e-05} +{"ts": "2025-12-27T19:15:07", "event": "train_log", "step": 5280, "epoch": 2.2278481012658227, "progress_pct": 37.13, "epoch_pct": 37.13, "eta": "42:30:19", "max_grad_norm": 0.8, "loss": 0.6027677655220032, "grad_norm": 1.1740167140960693, "learning_rate": 7.72483976124144e-05} +{"ts": "2025-12-27T19:15:39", "event": "train_log", "step": 5282, "epoch": 2.228691983122363, "progress_pct": 37.14, "epoch_pct": 37.14, "eta": "42:29:41", "max_grad_norm": 0.8, "loss": 0.6016151309013367, "grad_norm": 1.0600008964538574, "learning_rate": 7.722825924669326e-05} +{"ts": "2025-12-27T19:16:09", "event": "train_log", "step": 5284, "epoch": 2.229535864978903, "progress_pct": 37.16, "epoch_pct": 37.16, "eta": "42:29:00", "max_grad_norm": 0.8, "loss": 0.5905849933624268, "grad_norm": 1.2631008625030518, "learning_rate": 7.720811459993562e-05} +{"ts": "2025-12-27T19:16:39", "event": "train_log", "step": 5286, "epoch": 2.230379746835443, "progress_pct": 37.17, "epoch_pct": 37.17, "eta": "42:28:20", "max_grad_norm": 0.8, "loss": 0.5129587054252625, "grad_norm": 1.1024738550186157, "learning_rate": 7.718796367678848e-05} +{"ts": "2025-12-27T19:17:09", "event": "train_log", "step": 5288, "epoch": 2.2312236286919833, "progress_pct": 37.19, "epoch_pct": 37.19, "eta": "42:27:38", "max_grad_norm": 0.8, "loss": 0.5709586143493652, "grad_norm": 1.23116934299469, "learning_rate": 7.716780648190028e-05} +{"ts": "2025-12-27T19:17:38", "event": "train_log", "step": 5290, "epoch": 2.2320675105485233, "progress_pct": 37.2, "epoch_pct": 37.2, "eta": "42:26:54", "max_grad_norm": 0.8, "loss": 0.5454761385917664, "grad_norm": 1.2739102840423584, "learning_rate": 7.714764301992088e-05} +{"ts": "2025-12-27T19:18:05", "event": "train_log", "step": 5292, "epoch": 2.232911392405063, "progress_pct": 37.22, "epoch_pct": 37.22, "eta": "42:26:09", "max_grad_norm": 0.8, "loss": 0.537248969078064, "grad_norm": 1.303963303565979, "learning_rate": 7.712747329550162e-05} +{"ts": "2025-12-27T19:18:35", "event": "train_log", "step": 5294, "epoch": 2.233755274261603, "progress_pct": 37.23, "epoch_pct": 37.23, "eta": "42:25:27", "max_grad_norm": 0.8, "loss": 0.6364415884017944, "grad_norm": 1.2454309463500977, "learning_rate": 7.710729731329529e-05} +{"ts": "2025-12-27T19:19:04", "event": "train_log", "step": 5296, "epoch": 2.2345991561181435, "progress_pct": 37.24, "epoch_pct": 37.24, "eta": "42:24:44", "max_grad_norm": 0.8, "loss": 0.5640100240707397, "grad_norm": 1.2401882410049438, "learning_rate": 7.708711507795605e-05} +{"ts": "2025-12-27T19:19:33", "event": "train_log", "step": 5298, "epoch": 2.2354430379746835, "progress_pct": 37.26, "epoch_pct": 37.26, "eta": "42:24:01", "max_grad_norm": 0.8, "loss": 0.5919729471206665, "grad_norm": 1.197432041168213, "learning_rate": 7.706692659413959e-05} +{"ts": "2025-12-27T19:20:02", "event": "train_log", "step": 5300, "epoch": 2.2362869198312234, "progress_pct": 37.27, "epoch_pct": 37.27, "eta": "42:23:18", "max_grad_norm": 0.8, "loss": 0.5569849014282227, "grad_norm": 1.1779764890670776, "learning_rate": 7.704673186650298e-05} +{"ts": "2025-12-27T19:32:22", "event": "train_log", "step": 5300, "epoch": 2.2362869198312234, "progress_pct": 37.27, "epoch_pct": 37.27, "eta": "42:44:03", "max_grad_norm": 0.8, "eval_loss": 0.6898328065872192, "eval_runtime": 739.3794, "eval_samples_per_second": 2.85, "eval_steps_per_second": 2.85} +{"ts": "2025-12-27T19:32:34", "event": "train_log", "step": 5302, "epoch": 2.237130801687764, "progress_pct": 37.29, "epoch_pct": 37.29, "eta": "42:42:52", "max_grad_norm": 0.8, "loss": 0.5823061466217041, "grad_norm": 1.1371463537216187, "learning_rate": 7.702653089970479e-05} +{"ts": "2025-12-27T19:32:47", "event": "train_log", "step": 5304, "epoch": 2.2379746835443037, "progress_pct": 37.3, "epoch_pct": 37.3, "eta": "42:41:41", "max_grad_norm": 0.8, "loss": 0.5556252002716064, "grad_norm": 1.1877846717834473, "learning_rate": 7.700632369840497e-05} +{"ts": "2025-12-27T19:33:00", "event": "train_log", "step": 5306, "epoch": 2.2388185654008437, "progress_pct": 37.31, "epoch_pct": 37.31, "eta": "42:40:30", "max_grad_norm": 0.8, "loss": 0.5794119834899902, "grad_norm": 1.1580896377563477, "learning_rate": 7.698611026726492e-05} +{"ts": "2025-12-27T19:33:12", "event": "train_log", "step": 5308, "epoch": 2.239662447257384, "progress_pct": 37.33, "epoch_pct": 37.33, "eta": "42:39:19", "max_grad_norm": 0.8, "loss": 0.5828680396080017, "grad_norm": 1.29141366481781, "learning_rate": 7.696589061094755e-05} +{"ts": "2025-12-27T19:33:25", "event": "train_log", "step": 5310, "epoch": 2.240506329113924, "progress_pct": 37.34, "epoch_pct": 37.34, "eta": "42:38:08", "max_grad_norm": 0.8, "loss": 0.6161736845970154, "grad_norm": 1.1286728382110596, "learning_rate": 7.694566473411706e-05} +{"ts": "2025-12-27T19:33:38", "event": "train_log", "step": 5312, "epoch": 2.241350210970464, "progress_pct": 37.36, "epoch_pct": 37.36, "eta": "42:36:57", "max_grad_norm": 0.8, "loss": 0.570767879486084, "grad_norm": 1.0969985723495483, "learning_rate": 7.692543264143925e-05} +{"ts": "2025-12-27T19:33:50", "event": "train_log", "step": 5314, "epoch": 2.2421940928270043, "progress_pct": 37.37, "epoch_pct": 37.37, "eta": "42:35:44", "max_grad_norm": 0.8, "loss": 0.631476104259491, "grad_norm": 1.2902227640151978, "learning_rate": 7.690519433758123e-05} +{"ts": "2025-12-27T19:34:02", "event": "train_log", "step": 5316, "epoch": 2.2430379746835443, "progress_pct": 37.38, "epoch_pct": 37.38, "eta": "42:34:32", "max_grad_norm": 0.8, "loss": 0.6142309904098511, "grad_norm": 1.432735800743103, "learning_rate": 7.68849498272116e-05} +{"ts": "2025-12-27T19:34:15", "event": "train_log", "step": 5318, "epoch": 2.243881856540084, "progress_pct": 37.4, "epoch_pct": 37.4, "eta": "42:33:22", "max_grad_norm": 0.8, "loss": 0.5871514081954956, "grad_norm": 1.0824161767959595, "learning_rate": 7.686469911500038e-05} +{"ts": "2025-12-27T19:34:27", "event": "train_log", "step": 5320, "epoch": 2.2447257383966246, "progress_pct": 37.41, "epoch_pct": 37.41, "eta": "42:32:11", "max_grad_norm": 0.8, "loss": 0.6144557595252991, "grad_norm": 1.1694978475570679, "learning_rate": 7.684444220561902e-05} +{"ts": "2025-12-27T19:34:40", "event": "train_log", "step": 5322, "epoch": 2.2455696202531645, "progress_pct": 37.43, "epoch_pct": 37.43, "eta": "42:31:00", "max_grad_norm": 0.8, "loss": 0.6049425601959229, "grad_norm": 1.2981040477752686, "learning_rate": 7.68241791037404e-05} +{"ts": "2025-12-27T19:34:52", "event": "train_log", "step": 5324, "epoch": 2.2464135021097045, "progress_pct": 37.44, "epoch_pct": 37.44, "eta": "42:29:49", "max_grad_norm": 0.8, "loss": 0.5571867823600769, "grad_norm": 1.132128357887268, "learning_rate": 7.680390981403885e-05} +{"ts": "2025-12-27T19:35:05", "event": "train_log", "step": 5326, "epoch": 2.247257383966245, "progress_pct": 37.45, "epoch_pct": 37.45, "eta": "42:28:38", "max_grad_norm": 0.8, "loss": 0.5710517168045044, "grad_norm": 1.1760079860687256, "learning_rate": 7.678363434119005e-05} +{"ts": "2025-12-27T19:35:17", "event": "train_log", "step": 5328, "epoch": 2.248101265822785, "progress_pct": 37.47, "epoch_pct": 37.47, "eta": "42:27:27", "max_grad_norm": 0.8, "loss": 0.5508866906166077, "grad_norm": 1.1918572187423706, "learning_rate": 7.67633526898712e-05} +{"ts": "2025-12-27T19:35:30", "event": "train_log", "step": 5330, "epoch": 2.2489451476793247, "progress_pct": 37.48, "epoch_pct": 37.48, "eta": "42:26:17", "max_grad_norm": 0.8, "loss": 0.6242696046829224, "grad_norm": 1.1837294101715088, "learning_rate": 7.674306486476091e-05} +{"ts": "2025-12-27T19:35:43", "event": "train_log", "step": 5332, "epoch": 2.249789029535865, "progress_pct": 37.5, "epoch_pct": 37.5, "eta": "42:25:06", "max_grad_norm": 0.8, "loss": 0.5821678042411804, "grad_norm": 1.384918212890625, "learning_rate": 7.672277087053914e-05} +{"ts": "2025-12-27T19:35:55", "event": "train_log", "step": 5334, "epoch": 2.250632911392405, "progress_pct": 37.51, "epoch_pct": 37.51, "eta": "42:23:56", "max_grad_norm": 0.8, "loss": 0.5415928363800049, "grad_norm": 1.1248877048492432, "learning_rate": 7.670247071188738e-05} +{"ts": "2025-12-27T19:36:08", "event": "train_log", "step": 5336, "epoch": 2.251476793248945, "progress_pct": 37.52, "epoch_pct": 37.52, "eta": "42:22:45", "max_grad_norm": 0.8, "loss": 0.5475174188613892, "grad_norm": 1.228140950202942, "learning_rate": 7.668216439348843e-05} +{"ts": "2025-12-27T19:36:20", "event": "train_log", "step": 5338, "epoch": 2.2523206751054854, "progress_pct": 37.54, "epoch_pct": 37.54, "eta": "42:21:33", "max_grad_norm": 0.8, "loss": 0.5793306231498718, "grad_norm": 1.3816046714782715, "learning_rate": 7.666185192002662e-05} +{"ts": "2025-12-27T19:36:32", "event": "train_log", "step": 5340, "epoch": 2.2531645569620253, "progress_pct": 37.55, "epoch_pct": 37.55, "eta": "42:20:22", "max_grad_norm": 0.8, "loss": 0.6221131682395935, "grad_norm": 1.2446565628051758, "learning_rate": 7.664153329618759e-05} +{"ts": "2025-12-27T19:36:45", "event": "train_log", "step": 5342, "epoch": 2.2540084388185653, "progress_pct": 37.57, "epoch_pct": 37.57, "eta": "42:19:13", "max_grad_norm": 0.8, "loss": 0.5403847694396973, "grad_norm": 1.1677669286727905, "learning_rate": 7.662120852665852e-05} +{"ts": "2025-12-27T19:36:57", "event": "train_log", "step": 5344, "epoch": 2.2548523206751057, "progress_pct": 37.58, "epoch_pct": 37.58, "eta": "42:18:02", "max_grad_norm": 0.8, "loss": 0.620201587677002, "grad_norm": 1.2485873699188232, "learning_rate": 7.66008776161279e-05} +{"ts": "2025-12-27T19:37:10", "event": "train_log", "step": 5346, "epoch": 2.2556962025316456, "progress_pct": 37.59, "epoch_pct": 37.59, "eta": "42:16:51", "max_grad_norm": 0.8, "loss": 0.5969216227531433, "grad_norm": 1.2486802339553833, "learning_rate": 7.658054056928568e-05} +{"ts": "2025-12-27T19:37:22", "event": "train_log", "step": 5348, "epoch": 2.2565400843881855, "progress_pct": 37.61, "epoch_pct": 37.61, "eta": "42:15:40", "max_grad_norm": 0.8, "loss": 0.6376339793205261, "grad_norm": 1.2621372938156128, "learning_rate": 7.656019739082326e-05} +{"ts": "2025-12-27T19:37:34", "event": "train_log", "step": 5350, "epoch": 2.257383966244726, "progress_pct": 37.62, "epoch_pct": 37.62, "eta": "42:14:29", "max_grad_norm": 0.8, "loss": 0.6374872326850891, "grad_norm": 1.238633155822754, "learning_rate": 7.65398480854334e-05} +{"ts": "2025-12-27T19:37:46", "event": "train_log", "step": 5352, "epoch": 2.258227848101266, "progress_pct": 37.64, "epoch_pct": 37.64, "eta": "42:13:18", "max_grad_norm": 0.8, "loss": 0.6348551511764526, "grad_norm": 1.3031803369522095, "learning_rate": 7.651949265781029e-05} +{"ts": "2025-12-27T19:37:58", "event": "train_log", "step": 5354, "epoch": 2.259071729957806, "progress_pct": 37.65, "epoch_pct": 37.65, "eta": "42:12:07", "max_grad_norm": 0.8, "loss": 0.6267750859260559, "grad_norm": 1.3735158443450928, "learning_rate": 7.649913111264952e-05} +{"ts": "2025-12-27T19:38:11", "event": "train_log", "step": 5356, "epoch": 2.259915611814346, "progress_pct": 37.67, "epoch_pct": 37.67, "eta": "42:10:57", "max_grad_norm": 0.8, "loss": 0.623030960559845, "grad_norm": 1.1227772235870361, "learning_rate": 7.647876345464817e-05} +{"ts": "2025-12-27T19:38:23", "event": "train_log", "step": 5358, "epoch": 2.260759493670886, "progress_pct": 37.68, "epoch_pct": 37.68, "eta": "42:09:46", "max_grad_norm": 0.8, "loss": 0.5810713171958923, "grad_norm": 1.4555678367614746, "learning_rate": 7.645838968850459e-05} +{"ts": "2025-12-27T19:38:35", "event": "train_log", "step": 5360, "epoch": 2.261603375527426, "progress_pct": 37.69, "epoch_pct": 37.69, "eta": "42:08:35", "max_grad_norm": 0.8, "loss": 0.6150093078613281, "grad_norm": 1.227725863456726, "learning_rate": 7.643800981891867e-05} +{"ts": "2025-12-27T19:38:48", "event": "train_log", "step": 5362, "epoch": 2.2624472573839665, "progress_pct": 37.71, "epoch_pct": 37.71, "eta": "42:07:26", "max_grad_norm": 0.8, "loss": 0.5350445508956909, "grad_norm": 1.0648300647735596, "learning_rate": 7.641762385059161e-05} +{"ts": "2025-12-27T19:39:01", "event": "train_log", "step": 5364, "epoch": 2.2632911392405064, "progress_pct": 37.72, "epoch_pct": 37.72, "eta": "42:06:16", "max_grad_norm": 0.8, "loss": 0.6253421306610107, "grad_norm": 1.179452896118164, "learning_rate": 7.639723178822613e-05} +{"ts": "2025-12-27T19:39:14", "event": "train_log", "step": 5366, "epoch": 2.2641350210970463, "progress_pct": 37.74, "epoch_pct": 37.74, "eta": "42:05:06", "max_grad_norm": 0.8, "loss": 0.5512562990188599, "grad_norm": 1.0983240604400635, "learning_rate": 7.637683363652621e-05} +{"ts": "2025-12-27T19:39:33", "event": "train_log", "step": 5368, "epoch": 2.2649789029535867, "progress_pct": 37.75, "epoch_pct": 37.75, "eta": "42:04:08", "max_grad_norm": 0.8, "loss": 0.5584151148796082, "grad_norm": 1.1825451850891113, "learning_rate": 7.635642940019736e-05} +{"ts": "2025-12-27T19:39:57", "event": "train_log", "step": 5370, "epoch": 2.2658227848101267, "progress_pct": 37.76, "epoch_pct": 37.76, "eta": "42:03:17", "max_grad_norm": 0.8, "loss": 0.5881790518760681, "grad_norm": 1.1022000312805176, "learning_rate": 7.633601908394643e-05} +{"ts": "2025-12-27T19:40:20", "event": "train_log", "step": 5372, "epoch": 2.2666666666666666, "progress_pct": 37.78, "epoch_pct": 37.78, "eta": "42:02:23", "max_grad_norm": 0.8, "loss": 0.6060683131217957, "grad_norm": 1.1935697793960571, "learning_rate": 7.631560269248169e-05} +{"ts": "2025-12-27T19:40:42", "event": "train_log", "step": 5374, "epoch": 2.267510548523207, "progress_pct": 37.79, "epoch_pct": 37.79, "eta": "42:01:29", "max_grad_norm": 0.8, "loss": 0.5877062678337097, "grad_norm": 1.1174103021621704, "learning_rate": 7.62951802305128e-05} +{"ts": "2025-12-27T19:41:03", "event": "train_log", "step": 5376, "epoch": 2.268354430379747, "progress_pct": 37.81, "epoch_pct": 37.81, "eta": "42:00:33", "max_grad_norm": 0.8, "loss": 0.5145504474639893, "grad_norm": 1.3934977054595947, "learning_rate": 7.627475170275086e-05} +{"ts": "2025-12-27T19:41:24", "event": "train_log", "step": 5378, "epoch": 2.269198312236287, "progress_pct": 37.82, "epoch_pct": 37.82, "eta": "41:59:38", "max_grad_norm": 0.8, "loss": 0.6194025874137878, "grad_norm": 1.2637842893600464, "learning_rate": 7.625431711390831e-05} +{"ts": "2025-12-27T19:41:46", "event": "train_log", "step": 5380, "epoch": 2.270042194092827, "progress_pct": 37.83, "epoch_pct": 37.83, "eta": "41:58:43", "max_grad_norm": 0.8, "loss": 0.6205627918243408, "grad_norm": 1.2034388780593872, "learning_rate": 7.623387646869902e-05} +{"ts": "2025-12-27T19:42:02", "event": "train_log", "step": 5382, "epoch": 2.270886075949367, "progress_pct": 37.85, "epoch_pct": 37.85, "eta": "41:57:40", "max_grad_norm": 0.8, "loss": 0.5609696507453918, "grad_norm": 0.953880250453949, "learning_rate": 7.621342977183826e-05} +{"ts": "2025-12-27T19:42:14", "event": "train_log", "step": 5384, "epoch": 2.271729957805907, "progress_pct": 37.86, "epoch_pct": 37.86, "eta": "41:56:29", "max_grad_norm": 0.8, "loss": 0.6044906377792358, "grad_norm": 1.2841949462890625, "learning_rate": 7.619297702804272e-05} +{"ts": "2025-12-27T19:42:27", "event": "train_log", "step": 5386, "epoch": 2.272573839662447, "progress_pct": 37.88, "epoch_pct": 37.88, "eta": "41:55:19", "max_grad_norm": 0.8, "loss": 0.5420435667037964, "grad_norm": 1.146804690361023, "learning_rate": 7.617251824203037e-05} +{"ts": "2025-12-27T19:42:40", "event": "train_log", "step": 5388, "epoch": 2.2734177215189875, "progress_pct": 37.89, "epoch_pct": 37.89, "eta": "41:54:10", "max_grad_norm": 0.8, "loss": 0.6230710744857788, "grad_norm": 1.2225698232650757, "learning_rate": 7.615205341852076e-05} +{"ts": "2025-12-27T19:42:52", "event": "train_log", "step": 5390, "epoch": 2.2742616033755274, "progress_pct": 37.9, "epoch_pct": 37.9, "eta": "41:53:00", "max_grad_norm": 0.8, "loss": 0.6486349701881409, "grad_norm": 1.3423371315002441, "learning_rate": 7.613158256223467e-05} +{"ts": "2025-12-27T19:43:05", "event": "train_log", "step": 5392, "epoch": 2.2751054852320673, "progress_pct": 37.92, "epoch_pct": 37.92, "eta": "41:51:51", "max_grad_norm": 0.8, "loss": 0.6527825593948364, "grad_norm": 1.0840023756027222, "learning_rate": 7.611110567789435e-05} +{"ts": "2025-12-27T19:43:16", "event": "train_log", "step": 5394, "epoch": 2.2759493670886077, "progress_pct": 37.93, "epoch_pct": 37.93, "eta": "41:50:40", "max_grad_norm": 0.8, "loss": 0.6859483122825623, "grad_norm": 1.342466950416565, "learning_rate": 7.609062277022341e-05} +{"ts": "2025-12-27T19:43:30", "event": "train_log", "step": 5396, "epoch": 2.2767932489451477, "progress_pct": 37.95, "epoch_pct": 37.95, "eta": "41:49:32", "max_grad_norm": 0.8, "loss": 0.5536003708839417, "grad_norm": 1.0406129360198975, "learning_rate": 7.607013384394691e-05} +{"ts": "2025-12-27T19:43:42", "event": "train_log", "step": 5398, "epoch": 2.2776371308016876, "progress_pct": 37.96, "epoch_pct": 37.96, "eta": "41:48:23", "max_grad_norm": 0.8, "loss": 0.5488654971122742, "grad_norm": 1.0853544473648071, "learning_rate": 7.604963890379118e-05} +{"ts": "2025-12-27T19:43:56", "event": "train_log", "step": 5400, "epoch": 2.278481012658228, "progress_pct": 37.97, "epoch_pct": 37.97, "eta": "41:47:15", "max_grad_norm": 0.8, "loss": 0.6072142720222473, "grad_norm": 1.0330145359039307, "learning_rate": 7.602913795448407e-05} +{"ts": "2025-12-27T19:58:17", "event": "train_log", "step": 5400, "epoch": 2.278481012658228, "progress_pct": 37.97, "epoch_pct": 37.97, "eta": "42:10:42", "max_grad_norm": 0.8, "eval_loss": 0.6875645518302917, "eval_runtime": 861.3558, "eval_samples_per_second": 2.446, "eval_steps_per_second": 2.446} +{"ts": "2025-12-27T19:58:39", "event": "train_log", "step": 5402, "epoch": 2.279324894514768, "progress_pct": 37.99, "epoch_pct": 37.99, "eta": "42:09:46", "max_grad_norm": 0.8, "loss": 0.5420109033584595, "grad_norm": 1.1858742237091064, "learning_rate": 7.600863100075472e-05} +{"ts": "2025-12-27T19:59:01", "event": "train_log", "step": 5404, "epoch": 2.280168776371308, "progress_pct": 38.0, "epoch_pct": 38.0, "eta": "42:08:53", "max_grad_norm": 0.8, "loss": 0.6109243631362915, "grad_norm": 1.2126039266586304, "learning_rate": 7.598811804733373e-05} +{"ts": "2025-12-27T19:59:22", "event": "train_log", "step": 5406, "epoch": 2.2810126582278483, "progress_pct": 38.02, "epoch_pct": 38.02, "eta": "42:07:56", "max_grad_norm": 0.8, "loss": 0.5889696478843689, "grad_norm": 1.1290241479873657, "learning_rate": 7.5967599098953e-05} +{"ts": "2025-12-27T19:59:43", "event": "train_log", "step": 5408, "epoch": 2.281856540084388, "progress_pct": 38.03, "epoch_pct": 38.03, "eta": "42:06:59", "max_grad_norm": 0.8, "loss": 0.6548630595207214, "grad_norm": 1.320263147354126, "learning_rate": 7.594707416034586e-05} +{"ts": "2025-12-27T20:00:04", "event": "train_log", "step": 5410, "epoch": 2.282700421940928, "progress_pct": 38.05, "epoch_pct": 38.05, "eta": "42:06:04", "max_grad_norm": 0.8, "loss": 0.6556787490844727, "grad_norm": 1.346169114112854, "learning_rate": 7.592654323624703e-05} +{"ts": "2025-12-27T20:00:26", "event": "train_log", "step": 5412, "epoch": 2.2835443037974685, "progress_pct": 38.06, "epoch_pct": 38.06, "eta": "42:05:09", "max_grad_norm": 0.8, "loss": 0.5631673336029053, "grad_norm": 1.2104716300964355, "learning_rate": 7.590600633139265e-05} +{"ts": "2025-12-27T20:00:47", "event": "train_log", "step": 5414, "epoch": 2.2843881856540085, "progress_pct": 38.07, "epoch_pct": 38.07, "eta": "42:04:12", "max_grad_norm": 0.8, "loss": 0.5931088328361511, "grad_norm": 1.3298237323760986, "learning_rate": 7.58854634505201e-05} +{"ts": "2025-12-27T20:01:08", "event": "train_log", "step": 5416, "epoch": 2.2852320675105484, "progress_pct": 38.09, "epoch_pct": 38.09, "eta": "42:03:16", "max_grad_norm": 0.8, "loss": 0.6966755986213684, "grad_norm": 1.4201204776763916, "learning_rate": 7.586491459836829e-05} +{"ts": "2025-12-27T20:01:29", "event": "train_log", "step": 5418, "epoch": 2.286075949367089, "progress_pct": 38.1, "epoch_pct": 38.1, "eta": "42:02:21", "max_grad_norm": 0.8, "loss": 0.6172569394111633, "grad_norm": 1.253135323524475, "learning_rate": 7.584435977967743e-05} +{"ts": "2025-12-27T20:01:50", "event": "train_log", "step": 5420, "epoch": 2.2869198312236287, "progress_pct": 38.12, "epoch_pct": 38.12, "eta": "42:01:24", "max_grad_norm": 0.8, "loss": 0.5376655459403992, "grad_norm": 1.133144736289978, "learning_rate": 7.582379899918911e-05} +{"ts": "2025-12-27T20:02:12", "event": "train_log", "step": 5422, "epoch": 2.2877637130801687, "progress_pct": 38.13, "epoch_pct": 38.13, "eta": "42:00:29", "max_grad_norm": 0.8, "loss": 0.6138498187065125, "grad_norm": 1.1103745698928833, "learning_rate": 7.580323226164632e-05} +{"ts": "2025-12-27T20:02:33", "event": "train_log", "step": 5424, "epoch": 2.2886075949367086, "progress_pct": 38.14, "epoch_pct": 38.14, "eta": "41:59:34", "max_grad_norm": 0.8, "loss": 0.5049096345901489, "grad_norm": 1.091636300086975, "learning_rate": 7.57826595717934e-05} +{"ts": "2025-12-27T20:02:57", "event": "train_log", "step": 5426, "epoch": 2.289451476793249, "progress_pct": 38.16, "epoch_pct": 38.16, "eta": "41:58:42", "max_grad_norm": 0.8, "loss": 0.5666115283966064, "grad_norm": 1.2486571073532104, "learning_rate": 7.57620809343761e-05} +{"ts": "2025-12-27T20:03:19", "event": "train_log", "step": 5428, "epoch": 2.290295358649789, "progress_pct": 38.17, "epoch_pct": 38.17, "eta": "41:57:48", "max_grad_norm": 0.8, "loss": 0.49512919783592224, "grad_norm": 1.510684847831726, "learning_rate": 7.57414963541415e-05} +{"ts": "2025-12-27T20:03:41", "event": "train_log", "step": 5430, "epoch": 2.291139240506329, "progress_pct": 38.19, "epoch_pct": 38.19, "eta": "41:56:53", "max_grad_norm": 0.8, "loss": 0.558807373046875, "grad_norm": 1.1142191886901855, "learning_rate": 7.572090583583805e-05} +{"ts": "2025-12-27T20:04:02", "event": "train_log", "step": 5432, "epoch": 2.2919831223628693, "progress_pct": 38.2, "epoch_pct": 38.2, "eta": "41:55:58", "max_grad_norm": 0.8, "loss": 0.6245265603065491, "grad_norm": 1.1162657737731934, "learning_rate": 7.57003093842156e-05} +{"ts": "2025-12-27T20:04:24", "event": "train_log", "step": 5434, "epoch": 2.292827004219409, "progress_pct": 38.21, "epoch_pct": 38.21, "eta": "41:55:03", "max_grad_norm": 0.8, "loss": 0.5505527853965759, "grad_norm": 1.2784614562988281, "learning_rate": 7.567970700402537e-05} +{"ts": "2025-12-27T20:04:45", "event": "train_log", "step": 5436, "epoch": 2.293670886075949, "progress_pct": 38.23, "epoch_pct": 38.23, "eta": "41:54:07", "max_grad_norm": 0.8, "loss": 0.6137702465057373, "grad_norm": 1.3142638206481934, "learning_rate": 7.565909870001992e-05} +{"ts": "2025-12-27T20:05:07", "event": "train_log", "step": 5438, "epoch": 2.2945147679324895, "progress_pct": 38.24, "epoch_pct": 38.24, "eta": "41:53:14", "max_grad_norm": 0.8, "loss": 0.540766716003418, "grad_norm": 1.072805404663086, "learning_rate": 7.563848447695318e-05} +{"ts": "2025-12-27T20:05:27", "event": "train_log", "step": 5440, "epoch": 2.2953586497890295, "progress_pct": 38.26, "epoch_pct": 38.26, "eta": "41:52:16", "max_grad_norm": 0.8, "loss": 0.6806555986404419, "grad_norm": 1.2861377000808716, "learning_rate": 7.561786433958048e-05} +{"ts": "2025-12-27T20:05:49", "event": "train_log", "step": 5442, "epoch": 2.2962025316455694, "progress_pct": 38.27, "epoch_pct": 38.27, "eta": "41:51:21", "max_grad_norm": 0.8, "loss": 0.6191258430480957, "grad_norm": 1.3193045854568481, "learning_rate": 7.559723829265847e-05} +{"ts": "2025-12-27T20:06:10", "event": "train_log", "step": 5444, "epoch": 2.29704641350211, "progress_pct": 38.28, "epoch_pct": 38.28, "eta": "41:50:26", "max_grad_norm": 0.8, "loss": 0.6067718863487244, "grad_norm": 1.1969127655029297, "learning_rate": 7.55766063409452e-05} +{"ts": "2025-12-27T20:06:32", "event": "train_log", "step": 5446, "epoch": 2.2978902953586497, "progress_pct": 38.3, "epoch_pct": 38.3, "eta": "41:49:32", "max_grad_norm": 0.8, "loss": 0.5673627257347107, "grad_norm": 1.2129666805267334, "learning_rate": 7.555596848920006e-05} +{"ts": "2025-12-27T20:06:53", "event": "train_log", "step": 5448, "epoch": 2.2987341772151897, "progress_pct": 38.31, "epoch_pct": 38.31, "eta": "41:48:35", "max_grad_norm": 0.8, "loss": 0.61825031042099, "grad_norm": 1.1639961004257202, "learning_rate": 7.553532474218379e-05} +{"ts": "2025-12-27T20:07:12", "event": "train_log", "step": 5450, "epoch": 2.29957805907173, "progress_pct": 38.33, "epoch_pct": 38.33, "eta": "41:47:37", "max_grad_norm": 0.8, "loss": 0.6096790432929993, "grad_norm": 1.3893283605575562, "learning_rate": 7.551467510465852e-05} +{"ts": "2025-12-27T20:07:36", "event": "train_log", "step": 5452, "epoch": 2.30042194092827, "progress_pct": 38.34, "epoch_pct": 38.34, "eta": "41:46:46", "max_grad_norm": 0.8, "loss": 0.6121414303779602, "grad_norm": 1.0708417892456055, "learning_rate": 7.549401958138772e-05} +{"ts": "2025-12-27T20:07:56", "event": "train_log", "step": 5454, "epoch": 2.30126582278481, "progress_pct": 38.35, "epoch_pct": 38.35, "eta": "41:45:49", "max_grad_norm": 0.8, "loss": 0.6504668593406677, "grad_norm": 1.3299298286437988, "learning_rate": 7.547335817713624e-05} +{"ts": "2025-12-27T20:08:18", "event": "train_log", "step": 5456, "epoch": 2.3021097046413503, "progress_pct": 38.37, "epoch_pct": 38.37, "eta": "41:44:54", "max_grad_norm": 0.8, "loss": 0.5761144161224365, "grad_norm": 1.3594682216644287, "learning_rate": 7.545269089667022e-05} +{"ts": "2025-12-27T20:08:38", "event": "train_log", "step": 5458, "epoch": 2.3029535864978903, "progress_pct": 38.38, "epoch_pct": 38.38, "eta": "41:43:57", "max_grad_norm": 0.8, "loss": 0.5457773804664612, "grad_norm": 1.1089586019515991, "learning_rate": 7.543201774475726e-05} +{"ts": "2025-12-27T20:08:59", "event": "train_log", "step": 5460, "epoch": 2.3037974683544302, "progress_pct": 38.4, "epoch_pct": 38.4, "eta": "41:43:01", "max_grad_norm": 0.8, "loss": 0.6014775037765503, "grad_norm": 1.3472918272018433, "learning_rate": 7.541133872616624e-05} +{"ts": "2025-12-27T20:09:21", "event": "train_log", "step": 5462, "epoch": 2.3046413502109706, "progress_pct": 38.41, "epoch_pct": 38.41, "eta": "41:42:07", "max_grad_norm": 0.8, "loss": 0.6246467232704163, "grad_norm": 1.2757689952850342, "learning_rate": 7.53906538456674e-05} +{"ts": "2025-12-27T20:09:41", "event": "train_log", "step": 5464, "epoch": 2.3054852320675105, "progress_pct": 38.42, "epoch_pct": 38.42, "eta": "41:41:11", "max_grad_norm": 0.8, "loss": 0.6583935022354126, "grad_norm": 1.4598166942596436, "learning_rate": 7.536996310803236e-05} +{"ts": "2025-12-27T20:10:02", "event": "train_log", "step": 5466, "epoch": 2.3063291139240505, "progress_pct": 38.44, "epoch_pct": 38.44, "eta": "41:40:15", "max_grad_norm": 0.8, "loss": 0.562523603439331, "grad_norm": 1.2861602306365967, "learning_rate": 7.534926651803407e-05} +{"ts": "2025-12-27T20:10:24", "event": "train_log", "step": 5468, "epoch": 2.307172995780591, "progress_pct": 38.45, "epoch_pct": 38.45, "eta": "41:39:20", "max_grad_norm": 0.8, "loss": 0.6093505620956421, "grad_norm": 1.0953221321105957, "learning_rate": 7.532856408044684e-05} +{"ts": "2025-12-27T20:10:45", "event": "train_log", "step": 5470, "epoch": 2.308016877637131, "progress_pct": 38.47, "epoch_pct": 38.47, "eta": "41:38:25", "max_grad_norm": 0.8, "loss": 0.6196447014808655, "grad_norm": 1.0982829332351685, "learning_rate": 7.530785580004631e-05} +{"ts": "2025-12-27T20:11:06", "event": "train_log", "step": 5472, "epoch": 2.3088607594936708, "progress_pct": 38.48, "epoch_pct": 38.48, "eta": "41:37:30", "max_grad_norm": 0.8, "loss": 0.6360989212989807, "grad_norm": 1.2224280834197998, "learning_rate": 7.52871416816095e-05} +{"ts": "2025-12-27T20:11:26", "event": "train_log", "step": 5474, "epoch": 2.309704641350211, "progress_pct": 38.5, "epoch_pct": 38.5, "eta": "41:36:33", "max_grad_norm": 0.8, "loss": 0.6189543008804321, "grad_norm": 1.244486927986145, "learning_rate": 7.526642172991476e-05} +{"ts": "2025-12-27T20:11:47", "event": "train_log", "step": 5476, "epoch": 2.310548523206751, "progress_pct": 38.51, "epoch_pct": 38.51, "eta": "41:35:37", "max_grad_norm": 0.8, "loss": 0.6137582659721375, "grad_norm": 1.2408053874969482, "learning_rate": 7.524569594974178e-05} +{"ts": "2025-12-27T20:12:07", "event": "train_log", "step": 5478, "epoch": 2.311392405063291, "progress_pct": 38.52, "epoch_pct": 38.52, "eta": "41:34:41", "max_grad_norm": 0.8, "loss": 0.6462169289588928, "grad_norm": 1.3323272466659546, "learning_rate": 7.522496434587157e-05} +{"ts": "2025-12-27T20:12:30", "event": "train_log", "step": 5480, "epoch": 2.3122362869198314, "progress_pct": 38.54, "epoch_pct": 38.54, "eta": "41:33:48", "max_grad_norm": 0.8, "loss": 0.5495362877845764, "grad_norm": 1.1076425313949585, "learning_rate": 7.520422692308657e-05} +{"ts": "2025-12-27T20:12:51", "event": "train_log", "step": 5482, "epoch": 2.3130801687763713, "progress_pct": 38.55, "epoch_pct": 38.55, "eta": "41:32:52", "max_grad_norm": 0.8, "loss": 0.5560636520385742, "grad_norm": 1.3298509120941162, "learning_rate": 7.518348368617046e-05} +{"ts": "2025-12-27T20:13:12", "event": "train_log", "step": 5484, "epoch": 2.3139240506329113, "progress_pct": 38.57, "epoch_pct": 38.57, "eta": "41:31:58", "max_grad_norm": 0.8, "loss": 0.5763371586799622, "grad_norm": 1.0740195512771606, "learning_rate": 7.516273463990832e-05} +{"ts": "2025-12-27T20:13:32", "event": "train_log", "step": 5486, "epoch": 2.3147679324894517, "progress_pct": 38.58, "epoch_pct": 38.58, "eta": "41:31:01", "max_grad_norm": 0.8, "loss": 0.5111498832702637, "grad_norm": 1.0748567581176758, "learning_rate": 7.514197978908657e-05} +{"ts": "2025-12-27T20:13:52", "event": "train_log", "step": 5488, "epoch": 2.3156118143459916, "progress_pct": 38.59, "epoch_pct": 38.59, "eta": "41:30:04", "max_grad_norm": 0.8, "loss": 0.6599951982498169, "grad_norm": 1.2047218084335327, "learning_rate": 7.512121913849294e-05} +{"ts": "2025-12-27T20:14:13", "event": "train_log", "step": 5490, "epoch": 2.3164556962025316, "progress_pct": 38.61, "epoch_pct": 38.61, "eta": "41:29:09", "max_grad_norm": 0.8, "loss": 0.6409770846366882, "grad_norm": 1.2956700325012207, "learning_rate": 7.510045269291651e-05} +{"ts": "2025-12-27T20:14:36", "event": "train_log", "step": 5492, "epoch": 2.317299578059072, "progress_pct": 38.62, "epoch_pct": 38.62, "eta": "41:28:16", "max_grad_norm": 0.8, "loss": 0.5967662334442139, "grad_norm": 1.241860032081604, "learning_rate": 7.50796804571477e-05} +{"ts": "2025-12-27T20:14:57", "event": "train_log", "step": 5494, "epoch": 2.318143459915612, "progress_pct": 38.64, "epoch_pct": 38.64, "eta": "41:27:21", "max_grad_norm": 0.8, "loss": 0.5856342315673828, "grad_norm": 1.1612682342529297, "learning_rate": 7.50589024359783e-05} +{"ts": "2025-12-27T20:15:19", "event": "train_log", "step": 5496, "epoch": 2.318987341772152, "progress_pct": 38.65, "epoch_pct": 38.65, "eta": "41:26:27", "max_grad_norm": 0.8, "loss": 0.5652023553848267, "grad_norm": 1.0895500183105469, "learning_rate": 7.503811863420135e-05} +{"ts": "2025-12-27T20:15:41", "event": "train_log", "step": 5498, "epoch": 2.319831223628692, "progress_pct": 38.66, "epoch_pct": 38.66, "eta": "41:25:33", "max_grad_norm": 0.8, "loss": 0.6777268648147583, "grad_norm": 1.3374481201171875, "learning_rate": 7.50173290566113e-05} +{"ts": "2025-12-27T20:16:03", "event": "train_log", "step": 5500, "epoch": 2.320675105485232, "progress_pct": 38.68, "epoch_pct": 38.68, "eta": "41:24:40", "max_grad_norm": 0.8, "loss": 0.6052314043045044, "grad_norm": 1.192614197731018, "learning_rate": 7.499653370800391e-05} +{"ts": "2025-12-27T20:31:44", "event": "train_log", "step": 5500, "epoch": 2.320675105485232, "progress_pct": 38.68, "epoch_pct": 38.68, "eta": "41:49:33", "max_grad_norm": 0.8, "eval_loss": 0.6867148876190186, "eval_runtime": 941.3545, "eval_samples_per_second": 2.238, "eval_steps_per_second": 2.238} +{"ts": "2025-12-27T20:32:06", "event": "train_log", "step": 5502, "epoch": 2.321518987341772, "progress_pct": 38.69, "epoch_pct": 38.69, "eta": "41:48:39", "max_grad_norm": 0.8, "loss": 0.5208253860473633, "grad_norm": 1.1008832454681396, "learning_rate": 7.497573259317625e-05} +{"ts": "2025-12-27T20:32:27", "event": "train_log", "step": 5504, "epoch": 2.3223628691983125, "progress_pct": 38.71, "epoch_pct": 38.71, "eta": "41:47:43", "max_grad_norm": 0.8, "loss": 0.6352296471595764, "grad_norm": 1.2141541242599487, "learning_rate": 7.495492571692677e-05} +{"ts": "2025-12-27T20:32:48", "event": "train_log", "step": 5506, "epoch": 2.3232067510548524, "progress_pct": 38.72, "epoch_pct": 38.72, "eta": "41:46:47", "max_grad_norm": 0.8, "loss": 0.6132256388664246, "grad_norm": 1.2588802576065063, "learning_rate": 7.493411308405517e-05} +{"ts": "2025-12-27T20:33:10", "event": "train_log", "step": 5508, "epoch": 2.3240506329113924, "progress_pct": 38.73, "epoch_pct": 38.73, "eta": "41:45:51", "max_grad_norm": 0.8, "loss": 0.571265697479248, "grad_norm": 1.348765254020691, "learning_rate": 7.491329469936258e-05} +{"ts": "2025-12-27T20:33:30", "event": "train_log", "step": 5510, "epoch": 2.3248945147679323, "progress_pct": 38.75, "epoch_pct": 38.75, "eta": "41:44:55", "max_grad_norm": 0.8, "loss": 0.5433708429336548, "grad_norm": 1.266377329826355, "learning_rate": 7.489247056765135e-05} +{"ts": "2025-12-27T20:33:51", "event": "train_log", "step": 5512, "epoch": 2.3257383966244727, "progress_pct": 38.76, "epoch_pct": 38.76, "eta": "41:43:58", "max_grad_norm": 0.8, "loss": 0.6193158030509949, "grad_norm": 1.2920128107070923, "learning_rate": 7.487164069372523e-05} +{"ts": "2025-12-27T20:34:12", "event": "train_log", "step": 5514, "epoch": 2.3265822784810126, "progress_pct": 38.78, "epoch_pct": 38.78, "eta": "41:43:03", "max_grad_norm": 0.8, "loss": 0.5817977786064148, "grad_norm": 1.068169116973877, "learning_rate": 7.485080508238928e-05} +{"ts": "2025-12-27T20:34:33", "event": "train_log", "step": 5516, "epoch": 2.3274261603375526, "progress_pct": 38.79, "epoch_pct": 38.79, "eta": "41:42:07", "max_grad_norm": 0.8, "loss": 0.6558082103729248, "grad_norm": 1.2941710948944092, "learning_rate": 7.482996373844985e-05} +{"ts": "2025-12-27T20:34:57", "event": "train_log", "step": 5518, "epoch": 2.328270042194093, "progress_pct": 38.8, "epoch_pct": 38.8, "eta": "41:41:15", "max_grad_norm": 0.8, "loss": 0.5569961667060852, "grad_norm": 1.2143336534500122, "learning_rate": 7.480911666671467e-05} +{"ts": "2025-12-27T20:35:18", "event": "train_log", "step": 5520, "epoch": 2.329113924050633, "progress_pct": 38.82, "epoch_pct": 38.82, "eta": "41:40:20", "max_grad_norm": 0.8, "loss": 0.6497300863265991, "grad_norm": 1.3364789485931396, "learning_rate": 7.478826387199274e-05} +{"ts": "2025-12-27T20:35:40", "event": "train_log", "step": 5522, "epoch": 2.329957805907173, "progress_pct": 38.83, "epoch_pct": 38.83, "eta": "41:39:27", "max_grad_norm": 0.8, "loss": 0.5793087482452393, "grad_norm": 1.057530403137207, "learning_rate": 7.47674053590944e-05} +{"ts": "2025-12-27T20:36:02", "event": "train_log", "step": 5524, "epoch": 2.330801687763713, "progress_pct": 38.85, "epoch_pct": 38.85, "eta": "41:38:32", "max_grad_norm": 0.8, "loss": 0.5583140850067139, "grad_norm": 1.1543176174163818, "learning_rate": 7.47465411328313e-05} +{"ts": "2025-12-27T20:36:24", "event": "train_log", "step": 5526, "epoch": 2.331645569620253, "progress_pct": 38.86, "epoch_pct": 38.86, "eta": "41:37:38", "max_grad_norm": 0.8, "loss": 0.6318784952163696, "grad_norm": 1.3409180641174316, "learning_rate": 7.472567119801645e-05} +{"ts": "2025-12-27T20:36:46", "event": "train_log", "step": 5528, "epoch": 2.332489451476793, "progress_pct": 38.87, "epoch_pct": 38.87, "eta": "41:36:43", "max_grad_norm": 0.8, "loss": 0.5950855612754822, "grad_norm": 1.2899413108825684, "learning_rate": 7.47047955594641e-05} +{"ts": "2025-12-27T20:37:06", "event": "train_log", "step": 5530, "epoch": 2.3333333333333335, "progress_pct": 38.89, "epoch_pct": 38.89, "eta": "41:35:47", "max_grad_norm": 0.8, "loss": 0.6181023716926575, "grad_norm": 1.329220175743103, "learning_rate": 7.468391422198989e-05} +{"ts": "2025-12-27T20:37:28", "event": "train_log", "step": 5532, "epoch": 2.3341772151898734, "progress_pct": 38.9, "epoch_pct": 38.9, "eta": "41:34:53", "max_grad_norm": 0.8, "loss": 0.6384578943252563, "grad_norm": 1.202129602432251, "learning_rate": 7.466302719041073e-05} +{"ts": "2025-12-27T20:37:47", "event": "train_log", "step": 5534, "epoch": 2.3350210970464134, "progress_pct": 38.92, "epoch_pct": 38.92, "eta": "41:33:54", "max_grad_norm": 0.8, "loss": 0.6059293746948242, "grad_norm": 1.1890549659729004, "learning_rate": 7.464213446954487e-05} +{"ts": "2025-12-27T20:38:10", "event": "train_log", "step": 5536, "epoch": 2.3358649789029537, "progress_pct": 38.93, "epoch_pct": 38.93, "eta": "41:33:01", "max_grad_norm": 0.8, "loss": 0.6432797908782959, "grad_norm": 1.2041429281234741, "learning_rate": 7.462123606421183e-05} +{"ts": "2025-12-27T20:38:30", "event": "train_log", "step": 5538, "epoch": 2.3367088607594937, "progress_pct": 38.95, "epoch_pct": 38.95, "eta": "41:32:04", "max_grad_norm": 0.8, "loss": 0.6796717047691345, "grad_norm": 1.3827080726623535, "learning_rate": 7.460033197923249e-05} +{"ts": "2025-12-27T20:38:50", "event": "train_log", "step": 5540, "epoch": 2.3375527426160336, "progress_pct": 38.96, "epoch_pct": 38.96, "eta": "41:31:07", "max_grad_norm": 0.8, "loss": 0.5772476196289062, "grad_norm": 1.2323482036590576, "learning_rate": 7.457942221942903e-05} +{"ts": "2025-12-27T20:39:12", "event": "train_log", "step": 5542, "epoch": 2.338396624472574, "progress_pct": 38.97, "epoch_pct": 38.97, "eta": "41:30:13", "max_grad_norm": 0.8, "loss": 0.5964269042015076, "grad_norm": 1.2011388540267944, "learning_rate": 7.455850678962493e-05} +{"ts": "2025-12-27T20:39:33", "event": "train_log", "step": 5544, "epoch": 2.339240506329114, "progress_pct": 38.99, "epoch_pct": 38.99, "eta": "41:29:18", "max_grad_norm": 0.8, "loss": 0.6416608095169067, "grad_norm": 1.1133569478988647, "learning_rate": 7.453758569464495e-05} +{"ts": "2025-12-27T20:39:55", "event": "train_log", "step": 5546, "epoch": 2.340084388185654, "progress_pct": 39.0, "epoch_pct": 39.0, "eta": "41:28:24", "max_grad_norm": 0.8, "loss": 0.5668829679489136, "grad_norm": 1.1257679462432861, "learning_rate": 7.451665893931521e-05} +{"ts": "2025-12-27T20:40:16", "event": "train_log", "step": 5548, "epoch": 2.3409282700421943, "progress_pct": 39.02, "epoch_pct": 39.02, "eta": "41:27:29", "max_grad_norm": 0.8, "loss": 0.6029916405677795, "grad_norm": 1.3494724035263062, "learning_rate": 7.449572652846311e-05} +{"ts": "2025-12-27T20:40:37", "event": "train_log", "step": 5550, "epoch": 2.3417721518987342, "progress_pct": 39.03, "epoch_pct": 39.03, "eta": "41:26:33", "max_grad_norm": 0.8, "loss": 0.6336984634399414, "grad_norm": 1.2199759483337402, "learning_rate": 7.447478846691735e-05} +{"ts": "2025-12-27T20:40:57", "event": "train_log", "step": 5552, "epoch": 2.342616033755274, "progress_pct": 39.04, "epoch_pct": 39.04, "eta": "41:25:36", "max_grad_norm": 0.8, "loss": 0.579140305519104, "grad_norm": 1.2806570529937744, "learning_rate": 7.445384475950792e-05} +{"ts": "2025-12-27T20:41:20", "event": "train_log", "step": 5554, "epoch": 2.343459915611814, "progress_pct": 39.06, "epoch_pct": 39.06, "eta": "41:24:44", "max_grad_norm": 0.8, "loss": 0.6061640381813049, "grad_norm": 0.9874221086502075, "learning_rate": 7.443289541106616e-05} +{"ts": "2025-12-27T20:41:42", "event": "train_log", "step": 5556, "epoch": 2.3443037974683545, "progress_pct": 39.07, "epoch_pct": 39.07, "eta": "41:23:50", "max_grad_norm": 0.8, "loss": 0.5502339601516724, "grad_norm": 1.2271486520767212, "learning_rate": 7.441194042642467e-05} +{"ts": "2025-12-27T20:42:01", "event": "train_log", "step": 5558, "epoch": 2.3451476793248944, "progress_pct": 39.09, "epoch_pct": 39.09, "eta": "41:22:52", "max_grad_norm": 0.8, "loss": 0.5774438381195068, "grad_norm": 1.2522462606430054, "learning_rate": 7.439097981041738e-05} +{"ts": "2025-12-27T20:42:23", "event": "train_log", "step": 5560, "epoch": 2.3459915611814344, "progress_pct": 39.1, "epoch_pct": 39.1, "eta": "41:21:58", "max_grad_norm": 0.8, "loss": 0.6091527342796326, "grad_norm": 1.267204761505127, "learning_rate": 7.437001356787945e-05} +{"ts": "2025-12-27T20:42:46", "event": "train_log", "step": 5562, "epoch": 2.3468354430379748, "progress_pct": 39.11, "epoch_pct": 39.11, "eta": "41:21:05", "max_grad_norm": 0.8, "loss": 0.5443631410598755, "grad_norm": 1.1711935997009277, "learning_rate": 7.434904170364747e-05} +{"ts": "2025-12-27T20:43:08", "event": "train_log", "step": 5564, "epoch": 2.3476793248945147, "progress_pct": 39.13, "epoch_pct": 39.13, "eta": "41:20:12", "max_grad_norm": 0.8, "loss": 0.5255029201507568, "grad_norm": 1.085097074508667, "learning_rate": 7.432806422255918e-05} +{"ts": "2025-12-27T20:43:30", "event": "train_log", "step": 5566, "epoch": 2.3485232067510546, "progress_pct": 39.14, "epoch_pct": 39.14, "eta": "41:19:19", "max_grad_norm": 0.8, "loss": 0.5197238922119141, "grad_norm": 1.3244949579238892, "learning_rate": 7.430708112945369e-05} +{"ts": "2025-12-27T20:43:51", "event": "train_log", "step": 5568, "epoch": 2.349367088607595, "progress_pct": 39.16, "epoch_pct": 39.16, "eta": "41:18:24", "max_grad_norm": 0.8, "loss": 0.5576170682907104, "grad_norm": 1.3646879196166992, "learning_rate": 7.428609242917141e-05} +{"ts": "2025-12-27T20:44:12", "event": "train_log", "step": 5570, "epoch": 2.350210970464135, "progress_pct": 39.17, "epoch_pct": 39.17, "eta": "41:17:28", "max_grad_norm": 0.8, "loss": 0.6254662275314331, "grad_norm": 1.339190125465393, "learning_rate": 7.426509812655406e-05} +{"ts": "2025-12-27T20:44:33", "event": "train_log", "step": 5572, "epoch": 2.351054852320675, "progress_pct": 39.18, "epoch_pct": 39.18, "eta": "41:16:33", "max_grad_norm": 0.8, "loss": 0.6593500375747681, "grad_norm": 1.4624155759811401, "learning_rate": 7.424409822644457e-05} +{"ts": "2025-12-27T20:44:54", "event": "train_log", "step": 5574, "epoch": 2.3518987341772153, "progress_pct": 39.2, "epoch_pct": 39.2, "eta": "41:15:38", "max_grad_norm": 0.8, "loss": 0.6102238297462463, "grad_norm": 1.1931114196777344, "learning_rate": 7.422309273368722e-05} +{"ts": "2025-12-27T20:45:14", "event": "train_log", "step": 5576, "epoch": 2.3527426160337552, "progress_pct": 39.21, "epoch_pct": 39.21, "eta": "41:14:41", "max_grad_norm": 0.8, "loss": 0.6695854067802429, "grad_norm": 1.789340615272522, "learning_rate": 7.420208165312762e-05} +{"ts": "2025-12-27T20:45:35", "event": "train_log", "step": 5578, "epoch": 2.353586497890295, "progress_pct": 39.23, "epoch_pct": 39.23, "eta": "41:13:47", "max_grad_norm": 0.8, "loss": 0.578844428062439, "grad_norm": 1.2364262342453003, "learning_rate": 7.418106498961258e-05} +{"ts": "2025-12-27T20:45:58", "event": "train_log", "step": 5580, "epoch": 2.3544303797468356, "progress_pct": 39.24, "epoch_pct": 39.24, "eta": "41:12:54", "max_grad_norm": 0.8, "loss": 0.5717503428459167, "grad_norm": 1.1568509340286255, "learning_rate": 7.416004274799027e-05} +{"ts": "2025-12-27T20:46:18", "event": "train_log", "step": 5582, "epoch": 2.3552742616033755, "progress_pct": 39.25, "epoch_pct": 39.25, "eta": "41:11:58", "max_grad_norm": 0.8, "loss": 0.6170201897621155, "grad_norm": 1.1744630336761475, "learning_rate": 7.413901493311009e-05} +{"ts": "2025-12-27T20:46:41", "event": "train_log", "step": 5584, "epoch": 2.3561181434599154, "progress_pct": 39.27, "epoch_pct": 39.27, "eta": "41:11:05", "max_grad_norm": 0.8, "loss": 0.6482691764831543, "grad_norm": 1.0684332847595215, "learning_rate": 7.411798154982275e-05} +{"ts": "2025-12-27T20:47:02", "event": "train_log", "step": 5586, "epoch": 2.356962025316456, "progress_pct": 39.28, "epoch_pct": 39.28, "eta": "41:10:11", "max_grad_norm": 0.8, "loss": 0.572839617729187, "grad_norm": 1.046196460723877, "learning_rate": 7.409694260298025e-05} +{"ts": "2025-12-27T20:47:25", "event": "train_log", "step": 5588, "epoch": 2.3578059071729958, "progress_pct": 39.3, "epoch_pct": 39.3, "eta": "41:09:19", "max_grad_norm": 0.8, "loss": 0.5645976662635803, "grad_norm": 1.0110210180282593, "learning_rate": 7.407589809743591e-05} +{"ts": "2025-12-27T20:47:47", "event": "train_log", "step": 5590, "epoch": 2.3586497890295357, "progress_pct": 39.31, "epoch_pct": 39.31, "eta": "41:08:26", "max_grad_norm": 0.8, "loss": 0.5653133392333984, "grad_norm": 1.0801016092300415, "learning_rate": 7.405484803804425e-05} +{"ts": "2025-12-27T20:48:08", "event": "train_log", "step": 5592, "epoch": 2.359493670886076, "progress_pct": 39.32, "epoch_pct": 39.32, "eta": "41:07:30", "max_grad_norm": 0.8, "loss": 0.5972150564193726, "grad_norm": 1.0934380292892456, "learning_rate": 7.403379242966116e-05} +{"ts": "2025-12-27T20:48:29", "event": "train_log", "step": 5594, "epoch": 2.360337552742616, "progress_pct": 39.34, "epoch_pct": 39.34, "eta": "41:06:36", "max_grad_norm": 0.8, "loss": 0.5927542448043823, "grad_norm": 1.3722410202026367, "learning_rate": 7.40127312771437e-05} +{"ts": "2025-12-27T20:48:52", "event": "train_log", "step": 5596, "epoch": 2.361181434599156, "progress_pct": 39.35, "epoch_pct": 39.35, "eta": "41:05:44", "max_grad_norm": 0.8, "loss": 0.547027051448822, "grad_norm": 1.1567236185073853, "learning_rate": 7.399166458535032e-05} +{"ts": "2025-12-27T20:49:13", "event": "train_log", "step": 5598, "epoch": 2.3620253164556964, "progress_pct": 39.37, "epoch_pct": 39.37, "eta": "41:04:50", "max_grad_norm": 0.8, "loss": 0.5356617569923401, "grad_norm": 1.2254211902618408, "learning_rate": 7.397059235914067e-05} +{"ts": "2025-12-27T20:49:35", "event": "train_log", "step": 5600, "epoch": 2.3628691983122363, "progress_pct": 39.38, "epoch_pct": 39.38, "eta": "41:03:57", "max_grad_norm": 0.8, "loss": 0.5424175262451172, "grad_norm": 1.1529103517532349, "learning_rate": 7.394951460337575e-05} +{"ts": "2025-12-27T21:05:14", "event": "train_log", "step": 5600, "epoch": 2.3628691983122363, "progress_pct": 39.38, "epoch_pct": 39.38, "eta": "41:28:01", "max_grad_norm": 0.8, "eval_loss": 0.6851074695587158, "eval_runtime": 938.5536, "eval_samples_per_second": 2.245, "eval_steps_per_second": 2.245} +{"ts": "2025-12-27T21:05:33", "event": "train_log", "step": 5602, "epoch": 2.3637130801687762, "progress_pct": 39.4, "epoch_pct": 39.4, "eta": "41:27:04", "max_grad_norm": 0.8, "loss": 0.5834107398986816, "grad_norm": 1.2050299644470215, "learning_rate": 7.392843132291777e-05} +{"ts": "2025-12-27T21:05:55", "event": "train_log", "step": 5604, "epoch": 2.3645569620253166, "progress_pct": 39.41, "epoch_pct": 39.41, "eta": "41:26:10", "max_grad_norm": 0.8, "loss": 0.5445035099983215, "grad_norm": 1.264567494392395, "learning_rate": 7.390734252263024e-05} +{"ts": "2025-12-27T21:06:17", "event": "train_log", "step": 5606, "epoch": 2.3654008438818566, "progress_pct": 39.42, "epoch_pct": 39.42, "eta": "41:25:15", "max_grad_norm": 0.8, "loss": 0.6207653880119324, "grad_norm": 1.357791781425476, "learning_rate": 7.388624820737791e-05} +{"ts": "2025-12-27T21:06:38", "event": "train_log", "step": 5608, "epoch": 2.3662447257383965, "progress_pct": 39.44, "epoch_pct": 39.44, "eta": "41:24:20", "max_grad_norm": 0.8, "loss": 0.6628696322441101, "grad_norm": 1.2246928215026855, "learning_rate": 7.386514838202689e-05} +{"ts": "2025-12-27T21:07:00", "event": "train_log", "step": 5610, "epoch": 2.367088607594937, "progress_pct": 39.45, "epoch_pct": 39.45, "eta": "41:23:25", "max_grad_norm": 0.8, "loss": 0.5870704054832458, "grad_norm": 1.1455399990081787, "learning_rate": 7.384404305144447e-05} +{"ts": "2025-12-27T21:07:22", "event": "train_log", "step": 5612, "epoch": 2.367932489451477, "progress_pct": 39.47, "epoch_pct": 39.47, "eta": "41:22:31", "max_grad_norm": 0.8, "loss": 0.6160538792610168, "grad_norm": 1.2338638305664062, "learning_rate": 7.382293222049925e-05} +{"ts": "2025-12-27T21:07:43", "event": "train_log", "step": 5614, "epoch": 2.3687763713080168, "progress_pct": 39.48, "epoch_pct": 39.48, "eta": "41:21:36", "max_grad_norm": 0.8, "loss": 0.6274036765098572, "grad_norm": 1.231271505355835, "learning_rate": 7.38018158940611e-05} +{"ts": "2025-12-27T21:08:05", "event": "train_log", "step": 5616, "epoch": 2.369620253164557, "progress_pct": 39.49, "epoch_pct": 39.49, "eta": "41:20:41", "max_grad_norm": 0.8, "loss": 0.5623515248298645, "grad_norm": 1.022050380706787, "learning_rate": 7.378069407700114e-05} +{"ts": "2025-12-27T21:08:26", "event": "train_log", "step": 5618, "epoch": 2.370464135021097, "progress_pct": 39.51, "epoch_pct": 39.51, "eta": "41:19:46", "max_grad_norm": 0.8, "loss": 0.5505564212799072, "grad_norm": 1.2040951251983643, "learning_rate": 7.375956677419178e-05} +{"ts": "2025-12-27T21:08:46", "event": "train_log", "step": 5620, "epoch": 2.371308016877637, "progress_pct": 39.52, "epoch_pct": 39.52, "eta": "41:18:50", "max_grad_norm": 0.8, "loss": 0.6537002921104431, "grad_norm": 1.1754523515701294, "learning_rate": 7.373843399050668e-05} +{"ts": "2025-12-27T21:09:07", "event": "train_log", "step": 5622, "epoch": 2.3721518987341774, "progress_pct": 39.54, "epoch_pct": 39.54, "eta": "41:17:55", "max_grad_norm": 0.8, "loss": 0.6224458813667297, "grad_norm": 1.1710485219955444, "learning_rate": 7.371729573082073e-05} +{"ts": "2025-12-27T21:09:29", "event": "train_log", "step": 5624, "epoch": 2.3729957805907174, "progress_pct": 39.55, "epoch_pct": 39.55, "eta": "41:17:01", "max_grad_norm": 0.8, "loss": 0.6297177076339722, "grad_norm": 1.1629483699798584, "learning_rate": 7.36961520000102e-05} +{"ts": "2025-12-27T21:09:50", "event": "train_log", "step": 5626, "epoch": 2.3738396624472573, "progress_pct": 39.56, "epoch_pct": 39.56, "eta": "41:16:05", "max_grad_norm": 0.8, "loss": 0.5202008485794067, "grad_norm": 1.1069440841674805, "learning_rate": 7.367500280295248e-05} +{"ts": "2025-12-27T21:10:13", "event": "train_log", "step": 5628, "epoch": 2.3746835443037977, "progress_pct": 39.58, "epoch_pct": 39.58, "eta": "41:15:13", "max_grad_norm": 0.8, "loss": 0.5256102681159973, "grad_norm": 1.0068297386169434, "learning_rate": 7.36538481445263e-05} +{"ts": "2025-12-27T21:10:34", "event": "train_log", "step": 5630, "epoch": 2.3755274261603376, "progress_pct": 39.59, "epoch_pct": 39.59, "eta": "41:14:17", "max_grad_norm": 0.8, "loss": 0.5460903644561768, "grad_norm": 1.1103417873382568, "learning_rate": 7.363268802961161e-05} +{"ts": "2025-12-27T21:10:55", "event": "train_log", "step": 5632, "epoch": 2.3763713080168776, "progress_pct": 39.61, "epoch_pct": 39.61, "eta": "41:13:23", "max_grad_norm": 0.8, "loss": 0.5817124247550964, "grad_norm": 1.2885268926620483, "learning_rate": 7.361152246308969e-05} +{"ts": "2025-12-27T21:11:16", "event": "train_log", "step": 5634, "epoch": 2.377215189873418, "progress_pct": 39.62, "epoch_pct": 39.62, "eta": "41:12:28", "max_grad_norm": 0.8, "loss": 0.5415143966674805, "grad_norm": 1.233831524848938, "learning_rate": 7.359035144984302e-05} +{"ts": "2025-12-27T21:11:38", "event": "train_log", "step": 5636, "epoch": 2.378059071729958, "progress_pct": 39.63, "epoch_pct": 39.63, "eta": "41:11:33", "max_grad_norm": 0.8, "loss": 0.6837685108184814, "grad_norm": 1.3451908826828003, "learning_rate": 7.35691749947553e-05} +{"ts": "2025-12-27T21:12:00", "event": "train_log", "step": 5638, "epoch": 2.378902953586498, "progress_pct": 39.65, "epoch_pct": 39.65, "eta": "41:10:40", "max_grad_norm": 0.8, "loss": 0.5966196656227112, "grad_norm": 1.1320621967315674, "learning_rate": 7.354799310271159e-05} +{"ts": "2025-12-27T21:12:22", "event": "train_log", "step": 5640, "epoch": 2.379746835443038, "progress_pct": 39.66, "epoch_pct": 39.66, "eta": "41:09:47", "max_grad_norm": 0.8, "loss": 0.5607479214668274, "grad_norm": 1.1884461641311646, "learning_rate": 7.35268057785981e-05} +{"ts": "2025-12-27T21:12:45", "event": "train_log", "step": 5642, "epoch": 2.380590717299578, "progress_pct": 39.68, "epoch_pct": 39.68, "eta": "41:08:55", "max_grad_norm": 0.8, "loss": 0.595242977142334, "grad_norm": 1.2710856199264526, "learning_rate": 7.350561302730236e-05} +{"ts": "2025-12-27T21:13:06", "event": "train_log", "step": 5644, "epoch": 2.381434599156118, "progress_pct": 39.69, "epoch_pct": 39.69, "eta": "41:07:59", "max_grad_norm": 0.8, "loss": 0.6208752393722534, "grad_norm": 1.3110458850860596, "learning_rate": 7.348441485371314e-05} +{"ts": "2025-12-27T21:13:28", "event": "train_log", "step": 5646, "epoch": 2.382278481012658, "progress_pct": 39.7, "epoch_pct": 39.7, "eta": "41:07:05", "max_grad_norm": 0.8, "loss": 0.6173125505447388, "grad_norm": 1.1734380722045898, "learning_rate": 7.346321126272044e-05} +{"ts": "2025-12-27T21:13:51", "event": "train_log", "step": 5648, "epoch": 2.3831223628691984, "progress_pct": 39.72, "epoch_pct": 39.72, "eta": "41:06:14", "max_grad_norm": 0.8, "loss": 0.6013050675392151, "grad_norm": 1.2024762630462646, "learning_rate": 7.34420022592155e-05} +{"ts": "2025-12-27T21:14:14", "event": "train_log", "step": 5650, "epoch": 2.3839662447257384, "progress_pct": 39.73, "epoch_pct": 39.73, "eta": "41:05:22", "max_grad_norm": 0.8, "loss": 0.5919594764709473, "grad_norm": 1.1305288076400757, "learning_rate": 7.342078784809086e-05} +{"ts": "2025-12-27T21:14:36", "event": "train_log", "step": 5652, "epoch": 2.3848101265822783, "progress_pct": 39.75, "epoch_pct": 39.75, "eta": "41:04:28", "max_grad_norm": 0.8, "loss": 0.5399283766746521, "grad_norm": 1.075323462486267, "learning_rate": 7.339956803424028e-05} +{"ts": "2025-12-27T21:14:58", "event": "train_log", "step": 5654, "epoch": 2.3856540084388187, "progress_pct": 39.76, "epoch_pct": 39.76, "eta": "41:03:35", "max_grad_norm": 0.8, "loss": 0.6253576874732971, "grad_norm": 1.2035599946975708, "learning_rate": 7.337834282255873e-05} +{"ts": "2025-12-27T21:15:21", "event": "train_log", "step": 5656, "epoch": 2.3864978902953586, "progress_pct": 39.77, "epoch_pct": 39.77, "eta": "41:02:42", "max_grad_norm": 0.8, "loss": 0.5247007608413696, "grad_norm": 1.0572105646133423, "learning_rate": 7.335711221794251e-05} +{"ts": "2025-12-27T21:15:41", "event": "train_log", "step": 5658, "epoch": 2.3873417721518986, "progress_pct": 39.79, "epoch_pct": 39.79, "eta": "41:01:46", "max_grad_norm": 0.8, "loss": 0.5800243020057678, "grad_norm": 1.2701191902160645, "learning_rate": 7.333587622528906e-05} +{"ts": "2025-12-27T21:16:02", "event": "train_log", "step": 5660, "epoch": 2.388185654008439, "progress_pct": 39.8, "epoch_pct": 39.8, "eta": "41:00:51", "max_grad_norm": 0.8, "loss": 0.589645504951477, "grad_norm": 1.1772741079330444, "learning_rate": 7.331463484949716e-05} +{"ts": "2025-12-27T21:16:24", "event": "train_log", "step": 5662, "epoch": 2.389029535864979, "progress_pct": 39.82, "epoch_pct": 39.82, "eta": "40:59:58", "max_grad_norm": 0.8, "loss": 0.5820419192314148, "grad_norm": 1.0562703609466553, "learning_rate": 7.329338809546674e-05} +{"ts": "2025-12-27T21:16:45", "event": "train_log", "step": 5664, "epoch": 2.389873417721519, "progress_pct": 39.83, "epoch_pct": 39.83, "eta": "40:59:03", "max_grad_norm": 0.8, "loss": 0.591435432434082, "grad_norm": 1.1634355783462524, "learning_rate": 7.327213596809906e-05} +{"ts": "2025-12-27T21:17:07", "event": "train_log", "step": 5666, "epoch": 2.3907172995780592, "progress_pct": 39.85, "epoch_pct": 39.85, "eta": "40:58:09", "max_grad_norm": 0.8, "loss": 0.5630883574485779, "grad_norm": 1.2220302820205688, "learning_rate": 7.325087847229655e-05} +{"ts": "2025-12-27T21:17:28", "event": "train_log", "step": 5668, "epoch": 2.391561181434599, "progress_pct": 39.86, "epoch_pct": 39.86, "eta": "40:57:15", "max_grad_norm": 0.8, "loss": 0.6050130128860474, "grad_norm": 1.4087659120559692, "learning_rate": 7.322961561296294e-05} +{"ts": "2025-12-27T21:17:50", "event": "train_log", "step": 5670, "epoch": 2.392405063291139, "progress_pct": 39.87, "epoch_pct": 39.87, "eta": "40:56:21", "max_grad_norm": 0.8, "loss": 0.56146240234375, "grad_norm": 1.1126172542572021, "learning_rate": 7.320834739500313e-05} +{"ts": "2025-12-27T21:18:11", "event": "train_log", "step": 5672, "epoch": 2.3932489451476795, "progress_pct": 39.89, "epoch_pct": 39.89, "eta": "40:55:27", "max_grad_norm": 0.8, "loss": 0.5507852435112, "grad_norm": 0.99373859167099, "learning_rate": 7.31870738233233e-05} +{"ts": "2025-12-27T21:18:33", "event": "train_log", "step": 5674, "epoch": 2.3940928270042194, "progress_pct": 39.9, "epoch_pct": 39.9, "eta": "40:54:33", "max_grad_norm": 0.8, "loss": 0.5895347595214844, "grad_norm": 1.14408540725708, "learning_rate": 7.316579490283085e-05} +{"ts": "2025-12-27T21:18:55", "event": "train_log", "step": 5676, "epoch": 2.3949367088607594, "progress_pct": 39.92, "epoch_pct": 39.92, "eta": "40:53:40", "max_grad_norm": 0.8, "loss": 0.5304404497146606, "grad_norm": 1.1728581190109253, "learning_rate": 7.314451063843443e-05} +{"ts": "2025-12-27T21:19:17", "event": "train_log", "step": 5678, "epoch": 2.3957805907172998, "progress_pct": 39.93, "epoch_pct": 39.93, "eta": "40:52:47", "max_grad_norm": 0.8, "loss": 0.5805793404579163, "grad_norm": 1.1721378564834595, "learning_rate": 7.31232210350439e-05} +{"ts": "2025-12-27T21:19:38", "event": "train_log", "step": 5680, "epoch": 2.3966244725738397, "progress_pct": 39.94, "epoch_pct": 39.94, "eta": "40:51:52", "max_grad_norm": 0.8, "loss": 0.5671767592430115, "grad_norm": 1.0499866008758545, "learning_rate": 7.310192609757038e-05} +{"ts": "2025-12-27T21:19:59", "event": "train_log", "step": 5682, "epoch": 2.3974683544303796, "progress_pct": 39.96, "epoch_pct": 39.96, "eta": "40:50:58", "max_grad_norm": 0.8, "loss": 0.6335723400115967, "grad_norm": 1.0959177017211914, "learning_rate": 7.308062583092617e-05} +{"ts": "2025-12-27T21:20:21", "event": "train_log", "step": 5684, "epoch": 2.3983122362869196, "progress_pct": 39.97, "epoch_pct": 39.97, "eta": "40:50:04", "max_grad_norm": 0.8, "loss": 0.6032374501228333, "grad_norm": 1.31142258644104, "learning_rate": 7.305932024002487e-05} +{"ts": "2025-12-27T21:20:42", "event": "train_log", "step": 5686, "epoch": 2.39915611814346, "progress_pct": 39.99, "epoch_pct": 39.99, "eta": "40:49:10", "max_grad_norm": 0.8, "loss": 0.5492936372756958, "grad_norm": 0.9212818741798401, "learning_rate": 7.303800932978124e-05} +{"ts": "2025-12-27T21:21:04", "event": "train_log", "step": 5688, "epoch": 2.4, "progress_pct": 40.0, "epoch_pct": 40.0, "eta": "40:48:17", "max_grad_norm": 0.8, "loss": 0.5533297061920166, "grad_norm": 1.1956428289413452, "learning_rate": 7.301669310511132e-05} +{"ts": "2025-12-27T21:21:25", "event": "train_log", "step": 5690, "epoch": 2.40084388185654, "progress_pct": 40.01, "epoch_pct": 40.01, "eta": "40:47:22", "max_grad_norm": 0.8, "loss": 0.5859368443489075, "grad_norm": 1.4048634767532349, "learning_rate": 7.299537157093232e-05} +{"ts": "2025-12-27T21:21:49", "event": "train_log", "step": 5692, "epoch": 2.4016877637130802, "progress_pct": 40.03, "epoch_pct": 40.03, "eta": "40:46:31", "max_grad_norm": 0.8, "loss": 0.5099439024925232, "grad_norm": 1.0580679178237915, "learning_rate": 7.297404473216277e-05} +{"ts": "2025-12-27T21:22:10", "event": "train_log", "step": 5694, "epoch": 2.40253164556962, "progress_pct": 40.04, "epoch_pct": 40.04, "eta": "40:45:37", "max_grad_norm": 0.8, "loss": 0.5631486177444458, "grad_norm": 1.2450575828552246, "learning_rate": 7.29527125937223e-05} +{"ts": "2025-12-27T21:22:30", "event": "train_log", "step": 5696, "epoch": 2.40337552742616, "progress_pct": 40.06, "epoch_pct": 40.06, "eta": "40:44:42", "max_grad_norm": 0.8, "loss": 0.6045404672622681, "grad_norm": 1.338466763496399, "learning_rate": 7.293137516053187e-05} +{"ts": "2025-12-27T21:22:50", "event": "train_log", "step": 5698, "epoch": 2.4042194092827005, "progress_pct": 40.07, "epoch_pct": 40.07, "eta": "40:43:46", "max_grad_norm": 0.8, "loss": 0.6063475608825684, "grad_norm": 1.198588252067566, "learning_rate": 7.291003243751358e-05} +{"ts": "2025-12-27T21:23:11", "event": "train_log", "step": 5700, "epoch": 2.4050632911392404, "progress_pct": 40.08, "epoch_pct": 40.08, "eta": "40:42:51", "max_grad_norm": 0.8, "loss": 0.5734809041023254, "grad_norm": 1.2315080165863037, "learning_rate": 7.288868442959081e-05} +{"ts": "2025-12-27T21:38:52", "event": "train_log", "step": 5700, "epoch": 2.4050632911392404, "progress_pct": 40.08, "epoch_pct": 40.08, "eta": "41:06:18", "max_grad_norm": 0.8, "eval_loss": 0.6841402053833008, "eval_runtime": 941.6641, "eval_samples_per_second": 2.238, "eval_steps_per_second": 2.238} +{"ts": "2025-12-27T21:39:13", "event": "train_log", "step": 5702, "epoch": 2.4059071729957804, "progress_pct": 40.1, "epoch_pct": 40.1, "eta": "41:05:22", "max_grad_norm": 0.8, "loss": 0.5744594931602478, "grad_norm": 1.1494885683059692, "learning_rate": 7.286733114168812e-05} +{"ts": "2025-12-27T21:39:33", "event": "train_log", "step": 5704, "epoch": 2.4067510548523208, "progress_pct": 40.11, "epoch_pct": 40.11, "eta": "41:04:26", "max_grad_norm": 0.8, "loss": 0.611789882183075, "grad_norm": 1.3769505023956299, "learning_rate": 7.284597257873132e-05} +{"ts": "2025-12-27T21:39:54", "event": "train_log", "step": 5706, "epoch": 2.4075949367088607, "progress_pct": 40.13, "epoch_pct": 40.13, "eta": "41:03:31", "max_grad_norm": 0.8, "loss": 0.6091431975364685, "grad_norm": 1.2326449155807495, "learning_rate": 7.28246087456474e-05} +{"ts": "2025-12-27T21:40:16", "event": "train_log", "step": 5708, "epoch": 2.4084388185654007, "progress_pct": 40.14, "epoch_pct": 40.14, "eta": "41:02:37", "max_grad_norm": 0.8, "loss": 0.49431973695755005, "grad_norm": 1.1960830688476562, "learning_rate": 7.28032396473646e-05} +{"ts": "2025-12-27T21:40:40", "event": "train_log", "step": 5710, "epoch": 2.409282700421941, "progress_pct": 40.15, "epoch_pct": 40.15, "eta": "41:01:46", "max_grad_norm": 0.8, "loss": 0.5344718098640442, "grad_norm": 1.1672827005386353, "learning_rate": 7.278186528881237e-05} +{"ts": "2025-12-27T21:41:01", "event": "train_log", "step": 5712, "epoch": 2.410126582278481, "progress_pct": 40.17, "epoch_pct": 40.17, "eta": "41:00:51", "max_grad_norm": 0.8, "loss": 0.6011165380477905, "grad_norm": 1.1923719644546509, "learning_rate": 7.276048567492136e-05} +{"ts": "2025-12-27T21:41:22", "event": "train_log", "step": 5714, "epoch": 2.410970464135021, "progress_pct": 40.18, "epoch_pct": 40.18, "eta": "40:59:56", "max_grad_norm": 0.8, "loss": 0.6300925016403198, "grad_norm": 1.2314990758895874, "learning_rate": 7.273910081062341e-05} +{"ts": "2025-12-27T21:41:44", "event": "train_log", "step": 5716, "epoch": 2.4118143459915613, "progress_pct": 40.2, "epoch_pct": 40.2, "eta": "40:59:03", "max_grad_norm": 0.8, "loss": 0.56329345703125, "grad_norm": 0.8976680040359497, "learning_rate": 7.27177107008516e-05} +{"ts": "2025-12-27T21:42:05", "event": "train_log", "step": 5718, "epoch": 2.4126582278481012, "progress_pct": 40.21, "epoch_pct": 40.21, "eta": "40:58:07", "max_grad_norm": 0.8, "loss": 0.6266427040100098, "grad_norm": 1.2954038381576538, "learning_rate": 7.269631535054026e-05} +{"ts": "2025-12-27T21:42:27", "event": "train_log", "step": 5720, "epoch": 2.413502109704641, "progress_pct": 40.23, "epoch_pct": 40.23, "eta": "40:57:13", "max_grad_norm": 0.8, "loss": 0.6234018802642822, "grad_norm": 1.3357585668563843, "learning_rate": 7.267491476462485e-05} +{"ts": "2025-12-27T21:42:48", "event": "train_log", "step": 5722, "epoch": 2.4143459915611816, "progress_pct": 40.24, "epoch_pct": 40.24, "eta": "40:56:19", "max_grad_norm": 0.8, "loss": 0.5909059047698975, "grad_norm": 1.1913645267486572, "learning_rate": 7.265350894804209e-05} +{"ts": "2025-12-27T21:43:09", "event": "train_log", "step": 5724, "epoch": 2.4151898734177215, "progress_pct": 40.25, "epoch_pct": 40.25, "eta": "40:55:24", "max_grad_norm": 0.8, "loss": 0.5708479285240173, "grad_norm": 1.3425955772399902, "learning_rate": 7.263209790572986e-05} +{"ts": "2025-12-27T21:43:30", "event": "train_log", "step": 5726, "epoch": 2.4160337552742615, "progress_pct": 40.27, "epoch_pct": 40.27, "eta": "40:54:28", "max_grad_norm": 0.8, "loss": 0.5810034871101379, "grad_norm": 1.2258507013320923, "learning_rate": 7.261068164262734e-05} +{"ts": "2025-12-27T21:43:52", "event": "train_log", "step": 5728, "epoch": 2.416877637130802, "progress_pct": 40.28, "epoch_pct": 40.28, "eta": "40:53:35", "max_grad_norm": 0.8, "loss": 0.5939235687255859, "grad_norm": 1.348794937133789, "learning_rate": 7.258926016367479e-05} +{"ts": "2025-12-27T21:44:16", "event": "train_log", "step": 5730, "epoch": 2.4177215189873418, "progress_pct": 40.3, "epoch_pct": 40.3, "eta": "40:52:45", "max_grad_norm": 0.8, "loss": 0.6298259496688843, "grad_norm": 1.0896574258804321, "learning_rate": 7.256783347381375e-05} +{"ts": "2025-12-27T21:44:37", "event": "train_log", "step": 5732, "epoch": 2.4185654008438817, "progress_pct": 40.31, "epoch_pct": 40.31, "eta": "40:51:50", "max_grad_norm": 0.8, "loss": 0.5277430415153503, "grad_norm": 1.164866328239441, "learning_rate": 7.254640157798696e-05} +{"ts": "2025-12-27T21:44:59", "event": "train_log", "step": 5734, "epoch": 2.419409282700422, "progress_pct": 40.32, "epoch_pct": 40.32, "eta": "40:50:56", "max_grad_norm": 0.8, "loss": 0.5724055767059326, "grad_norm": 1.1215453147888184, "learning_rate": 7.252496448113833e-05} +{"ts": "2025-12-27T21:45:21", "event": "train_log", "step": 5736, "epoch": 2.420253164556962, "progress_pct": 40.34, "epoch_pct": 40.34, "eta": "40:50:04", "max_grad_norm": 0.8, "loss": 0.5439977645874023, "grad_norm": 1.0640764236450195, "learning_rate": 7.2503522188213e-05} +{"ts": "2025-12-27T21:45:42", "event": "train_log", "step": 5738, "epoch": 2.421097046413502, "progress_pct": 40.35, "epoch_pct": 40.35, "eta": "40:49:08", "max_grad_norm": 0.8, "loss": 0.7568614482879639, "grad_norm": 1.4874604940414429, "learning_rate": 7.248207470415729e-05} +{"ts": "2025-12-27T21:46:04", "event": "train_log", "step": 5740, "epoch": 2.4219409282700424, "progress_pct": 40.37, "epoch_pct": 40.37, "eta": "40:48:16", "max_grad_norm": 0.8, "loss": 0.6389632225036621, "grad_norm": 1.2611099481582642, "learning_rate": 7.246062203391873e-05} +{"ts": "2025-12-27T21:46:26", "event": "train_log", "step": 5742, "epoch": 2.4227848101265823, "progress_pct": 40.38, "epoch_pct": 40.38, "eta": "40:47:22", "max_grad_norm": 0.8, "loss": 0.6180628538131714, "grad_norm": 1.185644507408142, "learning_rate": 7.243916418244602e-05} +{"ts": "2025-12-27T21:46:47", "event": "train_log", "step": 5744, "epoch": 2.4236286919831223, "progress_pct": 40.39, "epoch_pct": 40.39, "eta": "40:46:27", "max_grad_norm": 0.8, "loss": 0.619799017906189, "grad_norm": 1.1648430824279785, "learning_rate": 7.241770115468909e-05} +{"ts": "2025-12-27T21:47:09", "event": "train_log", "step": 5746, "epoch": 2.4244725738396626, "progress_pct": 40.41, "epoch_pct": 40.41, "eta": "40:45:34", "max_grad_norm": 0.8, "loss": 0.6446201205253601, "grad_norm": 1.1974445581436157, "learning_rate": 7.239623295559903e-05} +{"ts": "2025-12-27T21:47:30", "event": "train_log", "step": 5748, "epoch": 2.4253164556962026, "progress_pct": 40.42, "epoch_pct": 40.42, "eta": "40:44:39", "max_grad_norm": 0.8, "loss": 0.5839580297470093, "grad_norm": 1.140477180480957, "learning_rate": 7.237475959012818e-05} +{"ts": "2025-12-27T21:47:53", "event": "train_log", "step": 5750, "epoch": 2.4261603375527425, "progress_pct": 40.44, "epoch_pct": 40.44, "eta": "40:43:47", "max_grad_norm": 0.8, "loss": 0.48815420269966125, "grad_norm": 1.1374423503875732, "learning_rate": 7.235328106322998e-05} +{"ts": "2025-12-27T21:48:14", "event": "train_log", "step": 5752, "epoch": 2.427004219409283, "progress_pct": 40.45, "epoch_pct": 40.45, "eta": "40:42:52", "max_grad_norm": 0.8, "loss": 0.638519287109375, "grad_norm": 1.411432147026062, "learning_rate": 7.233179737985916e-05} +{"ts": "2025-12-27T21:48:37", "event": "train_log", "step": 5754, "epoch": 2.427848101265823, "progress_pct": 40.46, "epoch_pct": 40.46, "eta": "40:42:00", "max_grad_norm": 0.8, "loss": 0.5776677131652832, "grad_norm": 1.1232497692108154, "learning_rate": 7.231030854497157e-05} +{"ts": "2025-12-27T21:49:00", "event": "train_log", "step": 5756, "epoch": 2.428691983122363, "progress_pct": 40.48, "epoch_pct": 40.48, "eta": "40:41:09", "max_grad_norm": 0.8, "loss": 0.5297027230262756, "grad_norm": 1.0815738439559937, "learning_rate": 7.228881456352428e-05} +{"ts": "2025-12-27T21:49:23", "event": "train_log", "step": 5758, "epoch": 2.429535864978903, "progress_pct": 40.49, "epoch_pct": 40.49, "eta": "40:40:17", "max_grad_norm": 0.8, "loss": 0.5630011558532715, "grad_norm": 1.2230733633041382, "learning_rate": 7.226731544047553e-05} +{"ts": "2025-12-27T21:49:43", "event": "train_log", "step": 5760, "epoch": 2.430379746835443, "progress_pct": 40.51, "epoch_pct": 40.51, "eta": "40:39:21", "max_grad_norm": 0.8, "loss": 0.5772101283073425, "grad_norm": 1.2033147811889648, "learning_rate": 7.224581118078476e-05} +{"ts": "2025-12-27T21:50:06", "event": "train_log", "step": 5762, "epoch": 2.431223628691983, "progress_pct": 40.52, "epoch_pct": 40.52, "eta": "40:38:30", "max_grad_norm": 0.8, "loss": 0.5412847399711609, "grad_norm": 1.2150053977966309, "learning_rate": 7.22243017894126e-05} +{"ts": "2025-12-27T21:50:28", "event": "train_log", "step": 5764, "epoch": 2.4320675105485234, "progress_pct": 40.53, "epoch_pct": 40.53, "eta": "40:37:37", "max_grad_norm": 0.8, "loss": 0.5568405389785767, "grad_norm": 1.0494824647903442, "learning_rate": 7.220278727132083e-05} +{"ts": "2025-12-27T21:50:52", "event": "train_log", "step": 5766, "epoch": 2.4329113924050634, "progress_pct": 40.55, "epoch_pct": 40.55, "eta": "40:36:46", "max_grad_norm": 0.8, "loss": 0.6022217869758606, "grad_norm": 1.2803306579589844, "learning_rate": 7.218126763147244e-05} +{"ts": "2025-12-27T21:51:14", "event": "train_log", "step": 5768, "epoch": 2.4337552742616033, "progress_pct": 40.56, "epoch_pct": 40.56, "eta": "40:35:54", "max_grad_norm": 0.8, "loss": 0.5568796396255493, "grad_norm": 1.0832798480987549, "learning_rate": 7.215974287483163e-05} +{"ts": "2025-12-27T21:51:37", "event": "train_log", "step": 5770, "epoch": 2.4345991561181437, "progress_pct": 40.58, "epoch_pct": 40.58, "eta": "40:35:02", "max_grad_norm": 0.8, "loss": 0.5607990026473999, "grad_norm": 1.1829264163970947, "learning_rate": 7.213821300636372e-05} +{"ts": "2025-12-27T21:51:58", "event": "train_log", "step": 5772, "epoch": 2.4354430379746836, "progress_pct": 40.59, "epoch_pct": 40.59, "eta": "40:34:07", "max_grad_norm": 0.8, "loss": 0.6382274031639099, "grad_norm": 2.3017473220825195, "learning_rate": 7.211667803103523e-05} +{"ts": "2025-12-27T21:52:19", "event": "train_log", "step": 5774, "epoch": 2.4362869198312236, "progress_pct": 40.6, "epoch_pct": 40.6, "eta": "40:33:13", "max_grad_norm": 0.8, "loss": 0.5748776793479919, "grad_norm": 1.1701387166976929, "learning_rate": 7.209513795381388e-05} +{"ts": "2025-12-27T21:52:41", "event": "train_log", "step": 5776, "epoch": 2.4371308016877635, "progress_pct": 40.62, "epoch_pct": 40.62, "eta": "40:32:20", "max_grad_norm": 0.8, "loss": 0.5760934352874756, "grad_norm": 1.0480856895446777, "learning_rate": 7.207359277966856e-05} +{"ts": "2025-12-27T21:53:03", "event": "train_log", "step": 5778, "epoch": 2.437974683544304, "progress_pct": 40.63, "epoch_pct": 40.63, "eta": "40:31:27", "max_grad_norm": 0.8, "loss": 0.6387208104133606, "grad_norm": 1.2263693809509277, "learning_rate": 7.20520425135693e-05} +{"ts": "2025-12-27T21:53:24", "event": "train_log", "step": 5780, "epoch": 2.438818565400844, "progress_pct": 40.65, "epoch_pct": 40.65, "eta": "40:30:32", "max_grad_norm": 0.8, "loss": 0.6078037619590759, "grad_norm": 1.219246506690979, "learning_rate": 7.203048716048737e-05} +{"ts": "2025-12-27T21:53:46", "event": "train_log", "step": 5782, "epoch": 2.439662447257384, "progress_pct": 40.66, "epoch_pct": 40.66, "eta": "40:29:39", "max_grad_norm": 0.8, "loss": 0.606924831867218, "grad_norm": 1.2452640533447266, "learning_rate": 7.200892672539515e-05} +{"ts": "2025-12-27T21:54:07", "event": "train_log", "step": 5784, "epoch": 2.440506329113924, "progress_pct": 40.68, "epoch_pct": 40.68, "eta": "40:28:45", "max_grad_norm": 0.8, "loss": 0.585297703742981, "grad_norm": 1.3469732999801636, "learning_rate": 7.198736121326621e-05} +{"ts": "2025-12-27T21:54:30", "event": "train_log", "step": 5786, "epoch": 2.441350210970464, "progress_pct": 40.69, "epoch_pct": 40.69, "eta": "40:27:53", "max_grad_norm": 0.8, "loss": 0.5849902033805847, "grad_norm": 1.151127576828003, "learning_rate": 7.196579062907533e-05} +{"ts": "2025-12-27T21:54:53", "event": "train_log", "step": 5788, "epoch": 2.442194092827004, "progress_pct": 40.7, "epoch_pct": 40.7, "eta": "40:27:02", "max_grad_norm": 0.8, "loss": 0.6150397062301636, "grad_norm": 1.0669564008712769, "learning_rate": 7.19442149777984e-05} +{"ts": "2025-12-27T21:55:13", "event": "train_log", "step": 5790, "epoch": 2.4430379746835444, "progress_pct": 40.72, "epoch_pct": 40.72, "eta": "40:26:07", "max_grad_norm": 0.8, "loss": 0.6324567794799805, "grad_norm": 1.1700209379196167, "learning_rate": 7.192263426441252e-05} +{"ts": "2025-12-27T21:55:35", "event": "train_log", "step": 5792, "epoch": 2.4438818565400844, "progress_pct": 40.73, "epoch_pct": 40.73, "eta": "40:25:14", "max_grad_norm": 0.8, "loss": 0.6202381253242493, "grad_norm": 1.2832094430923462, "learning_rate": 7.190104849389597e-05} +{"ts": "2025-12-27T21:55:57", "event": "train_log", "step": 5794, "epoch": 2.4447257383966243, "progress_pct": 40.75, "epoch_pct": 40.75, "eta": "40:24:21", "max_grad_norm": 0.8, "loss": 0.6156684756278992, "grad_norm": 1.2046177387237549, "learning_rate": 7.187945767122813e-05} +{"ts": "2025-12-27T21:56:19", "event": "train_log", "step": 5796, "epoch": 2.4455696202531647, "progress_pct": 40.76, "epoch_pct": 40.76, "eta": "40:23:29", "max_grad_norm": 0.8, "loss": 0.5763497352600098, "grad_norm": 1.031133770942688, "learning_rate": 7.185786180138961e-05} +{"ts": "2025-12-27T21:56:41", "event": "train_log", "step": 5798, "epoch": 2.4464135021097047, "progress_pct": 40.77, "epoch_pct": 40.77, "eta": "40:22:36", "max_grad_norm": 0.8, "loss": 0.5419677495956421, "grad_norm": 1.2803475856781006, "learning_rate": 7.183626088936216e-05} +{"ts": "2025-12-27T21:57:03", "event": "train_log", "step": 5800, "epoch": 2.4472573839662446, "progress_pct": 40.79, "epoch_pct": 40.79, "eta": "40:21:43", "max_grad_norm": 0.8, "loss": 0.629108190536499, "grad_norm": 1.2407588958740234, "learning_rate": 7.181465494012869e-05} +{"ts": "2025-12-27T22:09:41", "event": "train_log", "step": 5800, "epoch": 2.4472573839662446, "progress_pct": 40.79, "epoch_pct": 40.79, "eta": "40:40:04", "max_grad_norm": 0.8, "eval_loss": 0.6835155487060547, "eval_runtime": 758.407, "eval_samples_per_second": 2.778, "eval_steps_per_second": 2.778} +{"ts": "2025-12-27T22:09:53", "event": "train_log", "step": 5802, "epoch": 2.448101265822785, "progress_pct": 40.8, "epoch_pct": 40.8, "eta": "40:38:55", "max_grad_norm": 0.8, "loss": 0.6146516799926758, "grad_norm": 1.3525878190994263, "learning_rate": 7.17930439586733e-05} +{"ts": "2025-12-27T22:10:05", "event": "train_log", "step": 5804, "epoch": 2.448945147679325, "progress_pct": 40.82, "epoch_pct": 40.82, "eta": "40:37:48", "max_grad_norm": 0.8, "loss": 0.5796315670013428, "grad_norm": 1.255921721458435, "learning_rate": 7.177142794998121e-05} +{"ts": "2025-12-27T22:10:16", "event": "train_log", "step": 5806, "epoch": 2.449789029535865, "progress_pct": 40.83, "epoch_pct": 40.83, "eta": "40:36:40", "max_grad_norm": 0.8, "loss": 0.5978766679763794, "grad_norm": 1.2135448455810547, "learning_rate": 7.174980691903881e-05} +{"ts": "2025-12-27T22:10:29", "event": "train_log", "step": 5808, "epoch": 2.4506329113924052, "progress_pct": 40.84, "epoch_pct": 40.84, "eta": "40:35:32", "max_grad_norm": 0.8, "loss": 0.5941054821014404, "grad_norm": 1.117942214012146, "learning_rate": 7.172818087083367e-05} +{"ts": "2025-12-27T22:10:41", "event": "train_log", "step": 5810, "epoch": 2.451476793248945, "progress_pct": 40.86, "epoch_pct": 40.86, "eta": "40:34:25", "max_grad_norm": 0.8, "loss": 0.6213865876197815, "grad_norm": 1.2917672395706177, "learning_rate": 7.17065498103545e-05} +{"ts": "2025-12-27T22:10:53", "event": "train_log", "step": 5812, "epoch": 2.452320675105485, "progress_pct": 40.87, "epoch_pct": 40.87, "eta": "40:33:18", "max_grad_norm": 0.8, "loss": 0.627090573310852, "grad_norm": 1.2287952899932861, "learning_rate": 7.168491374259118e-05} +{"ts": "2025-12-27T22:11:06", "event": "train_log", "step": 5814, "epoch": 2.453164556962025, "progress_pct": 40.89, "epoch_pct": 40.89, "eta": "40:32:11", "max_grad_norm": 0.8, "loss": 0.605871319770813, "grad_norm": 1.2427480220794678, "learning_rate": 7.16632726725347e-05} +{"ts": "2025-12-27T22:11:18", "event": "train_log", "step": 5816, "epoch": 2.4540084388185655, "progress_pct": 40.9, "epoch_pct": 40.9, "eta": "40:31:03", "max_grad_norm": 0.8, "loss": 0.5961518883705139, "grad_norm": 1.2568929195404053, "learning_rate": 7.16416266051773e-05} +{"ts": "2025-12-27T22:11:31", "event": "train_log", "step": 5818, "epoch": 2.4548523206751054, "progress_pct": 40.91, "epoch_pct": 40.91, "eta": "40:29:57", "max_grad_norm": 0.8, "loss": 0.585054874420166, "grad_norm": 1.2202998399734497, "learning_rate": 7.161997554551226e-05} +{"ts": "2025-12-27T22:11:43", "event": "train_log", "step": 5820, "epoch": 2.4556962025316453, "progress_pct": 40.93, "epoch_pct": 40.93, "eta": "40:28:49", "max_grad_norm": 0.8, "loss": 0.6219096779823303, "grad_norm": 1.2326043844223022, "learning_rate": 7.159831949853409e-05} +{"ts": "2025-12-27T22:11:55", "event": "train_log", "step": 5822, "epoch": 2.4565400843881857, "progress_pct": 40.94, "epoch_pct": 40.94, "eta": "40:27:43", "max_grad_norm": 0.8, "loss": 0.641189455986023, "grad_norm": 1.2161623239517212, "learning_rate": 7.15766584692384e-05} +{"ts": "2025-12-27T22:12:08", "event": "train_log", "step": 5824, "epoch": 2.4573839662447257, "progress_pct": 40.96, "epoch_pct": 40.96, "eta": "40:26:37", "max_grad_norm": 0.8, "loss": 0.577190101146698, "grad_norm": 1.2391023635864258, "learning_rate": 7.1554992462622e-05} +{"ts": "2025-12-27T22:12:21", "event": "train_log", "step": 5826, "epoch": 2.4582278481012656, "progress_pct": 40.97, "epoch_pct": 40.97, "eta": "40:25:31", "max_grad_norm": 0.8, "loss": 0.5264694690704346, "grad_norm": 1.0883333683013916, "learning_rate": 7.153332148368281e-05} +{"ts": "2025-12-27T22:12:33", "event": "train_log", "step": 5828, "epoch": 2.459071729957806, "progress_pct": 40.98, "epoch_pct": 40.98, "eta": "40:24:24", "max_grad_norm": 0.8, "loss": 0.631437361240387, "grad_norm": 1.2129524946212769, "learning_rate": 7.15116455374199e-05} +{"ts": "2025-12-27T22:12:46", "event": "train_log", "step": 5830, "epoch": 2.459915611814346, "progress_pct": 41.0, "epoch_pct": 41.0, "eta": "40:23:18", "max_grad_norm": 0.8, "loss": 0.5025489926338196, "grad_norm": 1.0476374626159668, "learning_rate": 7.148996462883352e-05} +{"ts": "2025-12-27T22:13:00", "event": "train_log", "step": 5832, "epoch": 2.460759493670886, "progress_pct": 41.01, "epoch_pct": 41.01, "eta": "40:22:13", "max_grad_norm": 0.8, "loss": 0.5903586745262146, "grad_norm": 1.1389570236206055, "learning_rate": 7.146827876292502e-05} +{"ts": "2025-12-27T22:13:11", "event": "train_log", "step": 5834, "epoch": 2.4616033755274263, "progress_pct": 41.03, "epoch_pct": 41.03, "eta": "40:21:05", "max_grad_norm": 0.8, "loss": 0.633786141872406, "grad_norm": 1.4385539293289185, "learning_rate": 7.14465879446969e-05} +{"ts": "2025-12-27T22:13:24", "event": "train_log", "step": 5836, "epoch": 2.462447257383966, "progress_pct": 41.04, "epoch_pct": 41.04, "eta": "40:19:59", "max_grad_norm": 0.8, "loss": 0.5889136791229248, "grad_norm": 1.1184585094451904, "learning_rate": 7.142489217915283e-05} +{"ts": "2025-12-27T22:13:36", "event": "train_log", "step": 5838, "epoch": 2.463291139240506, "progress_pct": 41.05, "epoch_pct": 41.05, "eta": "40:18:52", "max_grad_norm": 0.8, "loss": 0.5774597525596619, "grad_norm": 1.2257685661315918, "learning_rate": 7.140319147129763e-05} +{"ts": "2025-12-27T22:13:49", "event": "train_log", "step": 5840, "epoch": 2.4641350210970465, "progress_pct": 41.07, "epoch_pct": 41.07, "eta": "40:17:47", "max_grad_norm": 0.8, "loss": 0.5220611095428467, "grad_norm": 0.9524238109588623, "learning_rate": 7.13814858261372e-05} +{"ts": "2025-12-27T22:14:02", "event": "train_log", "step": 5842, "epoch": 2.4649789029535865, "progress_pct": 41.08, "epoch_pct": 41.08, "eta": "40:16:41", "max_grad_norm": 0.8, "loss": 0.5724858641624451, "grad_norm": 1.2814422845840454, "learning_rate": 7.135977524867861e-05} +{"ts": "2025-12-27T22:14:15", "event": "train_log", "step": 5844, "epoch": 2.4658227848101264, "progress_pct": 41.1, "epoch_pct": 41.1, "eta": "40:15:35", "max_grad_norm": 0.8, "loss": 0.5469759702682495, "grad_norm": 1.0978140830993652, "learning_rate": 7.133805974393013e-05} +{"ts": "2025-12-27T22:14:27", "event": "train_log", "step": 5846, "epoch": 2.466666666666667, "progress_pct": 41.11, "epoch_pct": 41.11, "eta": "40:14:27", "max_grad_norm": 0.8, "loss": 0.6554312705993652, "grad_norm": 1.310279130935669, "learning_rate": 7.131633931690104e-05} +{"ts": "2025-12-27T22:14:39", "event": "train_log", "step": 5848, "epoch": 2.4675105485232067, "progress_pct": 41.13, "epoch_pct": 41.13, "eta": "40:13:20", "max_grad_norm": 0.8, "loss": 0.6166019439697266, "grad_norm": 1.286189317703247, "learning_rate": 7.129461397260187e-05} +{"ts": "2025-12-27T22:14:51", "event": "train_log", "step": 5850, "epoch": 2.4683544303797467, "progress_pct": 41.14, "epoch_pct": 41.14, "eta": "40:12:14", "max_grad_norm": 0.8, "loss": 0.6301121711730957, "grad_norm": 1.1586377620697021, "learning_rate": 7.127288371604424e-05} +{"ts": "2025-12-27T22:15:04", "event": "train_log", "step": 5852, "epoch": 2.469198312236287, "progress_pct": 41.15, "epoch_pct": 41.15, "eta": "40:11:08", "max_grad_norm": 0.8, "loss": 0.6022663712501526, "grad_norm": 1.1684564352035522, "learning_rate": 7.125114855224087e-05} +{"ts": "2025-12-27T22:15:16", "event": "train_log", "step": 5854, "epoch": 2.470042194092827, "progress_pct": 41.17, "epoch_pct": 41.17, "eta": "40:10:02", "max_grad_norm": 0.8, "loss": 0.5959302186965942, "grad_norm": 1.182511329650879, "learning_rate": 7.122940848620567e-05} +{"ts": "2025-12-27T22:15:29", "event": "train_log", "step": 5856, "epoch": 2.470886075949367, "progress_pct": 41.18, "epoch_pct": 41.18, "eta": "40:08:56", "max_grad_norm": 0.8, "loss": 0.6251413822174072, "grad_norm": 1.2383002042770386, "learning_rate": 7.120766352295366e-05} +{"ts": "2025-12-27T22:15:42", "event": "train_log", "step": 5858, "epoch": 2.4717299578059073, "progress_pct": 41.2, "epoch_pct": 41.2, "eta": "40:07:51", "max_grad_norm": 0.8, "loss": 0.6332544088363647, "grad_norm": 1.2001979351043701, "learning_rate": 7.118591366750097e-05} +{"ts": "2025-12-27T22:15:55", "event": "train_log", "step": 5860, "epoch": 2.4725738396624473, "progress_pct": 41.21, "epoch_pct": 41.21, "eta": "40:06:46", "max_grad_norm": 0.8, "loss": 0.5797795057296753, "grad_norm": 1.2166392803192139, "learning_rate": 7.116415892486488e-05} +{"ts": "2025-12-27T22:16:08", "event": "train_log", "step": 5862, "epoch": 2.473417721518987, "progress_pct": 41.22, "epoch_pct": 41.22, "eta": "40:05:40", "max_grad_norm": 0.8, "loss": 0.5335313081741333, "grad_norm": 1.2235382795333862, "learning_rate": 7.114239930006379e-05} +{"ts": "2025-12-27T22:16:21", "event": "train_log", "step": 5864, "epoch": 2.4742616033755276, "progress_pct": 41.24, "epoch_pct": 41.24, "eta": "40:04:35", "max_grad_norm": 0.8, "loss": 0.5536905527114868, "grad_norm": 1.2405973672866821, "learning_rate": 7.112063479811724e-05} +{"ts": "2025-12-27T22:16:34", "event": "train_log", "step": 5866, "epoch": 2.4751054852320675, "progress_pct": 41.25, "epoch_pct": 41.25, "eta": "40:03:30", "max_grad_norm": 0.8, "loss": 0.554654061794281, "grad_norm": 1.116328477859497, "learning_rate": 7.109886542404585e-05} +{"ts": "2025-12-27T22:16:45", "event": "train_log", "step": 5868, "epoch": 2.4759493670886075, "progress_pct": 41.27, "epoch_pct": 41.27, "eta": "40:02:23", "max_grad_norm": 0.8, "loss": 0.6017873287200928, "grad_norm": 1.2757837772369385, "learning_rate": 7.107709118287143e-05} +{"ts": "2025-12-27T22:16:58", "event": "train_log", "step": 5870, "epoch": 2.476793248945148, "progress_pct": 41.28, "epoch_pct": 41.28, "eta": "40:01:17", "max_grad_norm": 0.8, "loss": 0.6479908227920532, "grad_norm": 1.3445937633514404, "learning_rate": 7.105531207961686e-05} +{"ts": "2025-12-27T22:17:10", "event": "train_log", "step": 5872, "epoch": 2.477637130801688, "progress_pct": 41.29, "epoch_pct": 41.29, "eta": "40:00:11", "max_grad_norm": 0.8, "loss": 0.5829157829284668, "grad_norm": 1.1464542150497437, "learning_rate": 7.103352811930619e-05} +{"ts": "2025-12-27T22:17:23", "event": "train_log", "step": 5874, "epoch": 2.4784810126582277, "progress_pct": 41.31, "epoch_pct": 41.31, "eta": "39:59:05", "max_grad_norm": 0.8, "loss": 0.54380863904953, "grad_norm": 1.3275130987167358, "learning_rate": 7.101173930696453e-05} +{"ts": "2025-12-27T22:17:36", "event": "train_log", "step": 5876, "epoch": 2.479324894514768, "progress_pct": 41.32, "epoch_pct": 41.32, "eta": "39:58:00", "max_grad_norm": 0.8, "loss": 0.6313910484313965, "grad_norm": 1.006990909576416, "learning_rate": 7.098994564761813e-05} +{"ts": "2025-12-27T22:17:50", "event": "train_log", "step": 5878, "epoch": 2.480168776371308, "progress_pct": 41.34, "epoch_pct": 41.34, "eta": "39:56:56", "max_grad_norm": 0.8, "loss": 0.5343483090400696, "grad_norm": 1.1358299255371094, "learning_rate": 7.09681471462944e-05} +{"ts": "2025-12-27T22:18:03", "event": "train_log", "step": 5880, "epoch": 2.481012658227848, "progress_pct": 41.35, "epoch_pct": 41.35, "eta": "39:55:52", "max_grad_norm": 0.8, "loss": 0.49450409412384033, "grad_norm": 1.1456117630004883, "learning_rate": 7.094634380802184e-05} +{"ts": "2025-12-27T22:18:15", "event": "train_log", "step": 5882, "epoch": 2.4818565400843884, "progress_pct": 41.36, "epoch_pct": 41.36, "eta": "39:54:45", "max_grad_norm": 0.8, "loss": 0.6378757357597351, "grad_norm": 1.2961846590042114, "learning_rate": 7.092453563783003e-05} +{"ts": "2025-12-27T22:18:28", "event": "train_log", "step": 5884, "epoch": 2.4827004219409283, "progress_pct": 41.38, "epoch_pct": 41.38, "eta": "39:53:40", "max_grad_norm": 0.8, "loss": 0.5937124490737915, "grad_norm": 0.983889102935791, "learning_rate": 7.090272264074972e-05} +{"ts": "2025-12-27T22:18:41", "event": "train_log", "step": 5886, "epoch": 2.4835443037974683, "progress_pct": 41.39, "epoch_pct": 41.39, "eta": "39:52:36", "max_grad_norm": 0.8, "loss": 0.5301283597946167, "grad_norm": 1.0205817222595215, "learning_rate": 7.088090482181273e-05} +{"ts": "2025-12-27T22:18:54", "event": "train_log", "step": 5888, "epoch": 2.4843881856540087, "progress_pct": 41.41, "epoch_pct": 41.41, "eta": "39:51:32", "max_grad_norm": 0.8, "loss": 0.6191756129264832, "grad_norm": 1.1721397638320923, "learning_rate": 7.085908218605204e-05} +{"ts": "2025-12-27T22:19:06", "event": "train_log", "step": 5890, "epoch": 2.4852320675105486, "progress_pct": 41.42, "epoch_pct": 41.42, "eta": "39:50:25", "max_grad_norm": 0.8, "loss": 0.5928890109062195, "grad_norm": 1.2432814836502075, "learning_rate": 7.083725473850168e-05} +{"ts": "2025-12-27T22:19:19", "event": "train_log", "step": 5892, "epoch": 2.4860759493670885, "progress_pct": 41.43, "epoch_pct": 41.43, "eta": "39:49:19", "max_grad_norm": 0.8, "loss": 0.6136764287948608, "grad_norm": 1.252125859260559, "learning_rate": 7.081542248419686e-05} +{"ts": "2025-12-27T22:19:31", "event": "train_log", "step": 5894, "epoch": 2.486919831223629, "progress_pct": 41.45, "epoch_pct": 41.45, "eta": "39:48:13", "max_grad_norm": 0.8, "loss": 0.6084910035133362, "grad_norm": 1.3686699867248535, "learning_rate": 7.079358542817382e-05} +{"ts": "2025-12-27T22:19:43", "event": "train_log", "step": 5896, "epoch": 2.487763713080169, "progress_pct": 41.46, "epoch_pct": 41.46, "eta": "39:47:08", "max_grad_norm": 0.8, "loss": 0.5862250924110413, "grad_norm": 1.0877282619476318, "learning_rate": 7.077174357546996e-05} +{"ts": "2025-12-27T22:19:55", "event": "train_log", "step": 5898, "epoch": 2.488607594936709, "progress_pct": 41.48, "epoch_pct": 41.48, "eta": "39:46:02", "max_grad_norm": 0.8, "loss": 0.6300894021987915, "grad_norm": 1.164095401763916, "learning_rate": 7.074989693112381e-05} +{"ts": "2025-12-27T22:20:08", "event": "train_log", "step": 5900, "epoch": 2.489451476793249, "progress_pct": 41.49, "epoch_pct": 41.49, "eta": "39:44:58", "max_grad_norm": 0.8, "loss": 0.5508570075035095, "grad_norm": 1.1169507503509521, "learning_rate": 7.072804550017493e-05} +{"ts": "2025-12-27T22:28:42", "event": "train_log", "step": 5900, "epoch": 2.489451476793249, "progress_pct": 41.49, "epoch_pct": 41.49, "eta": "39:57:02", "max_grad_norm": 0.8, "eval_loss": 0.6820966005325317, "eval_runtime": 513.3515, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104} +{"ts": "2025-12-27T22:28:54", "event": "train_log", "step": 5902, "epoch": 2.490295358649789, "progress_pct": 41.5, "epoch_pct": 41.5, "eta": "39:55:56", "max_grad_norm": 0.8, "loss": 0.550847589969635, "grad_norm": 1.1718615293502808, "learning_rate": 7.070618928766406e-05} +{"ts": "2025-12-27T22:29:06", "event": "train_log", "step": 5904, "epoch": 2.491139240506329, "progress_pct": 41.52, "epoch_pct": 41.52, "eta": "39:54:49", "max_grad_norm": 0.8, "loss": 0.5663347840309143, "grad_norm": 1.4725650548934937, "learning_rate": 7.068432829863298e-05} +{"ts": "2025-12-27T22:29:19", "event": "train_log", "step": 5906, "epoch": 2.491983122362869, "progress_pct": 41.53, "epoch_pct": 41.53, "eta": "39:53:44", "max_grad_norm": 0.8, "loss": 0.5506191849708557, "grad_norm": 1.042083978652954, "learning_rate": 7.066246253812462e-05} +{"ts": "2025-12-27T22:29:31", "event": "train_log", "step": 5908, "epoch": 2.4928270042194094, "progress_pct": 41.55, "epoch_pct": 41.55, "eta": "39:52:39", "max_grad_norm": 0.8, "loss": 0.5656929612159729, "grad_norm": 1.2020974159240723, "learning_rate": 7.064059201118297e-05} +{"ts": "2025-12-27T22:29:45", "event": "train_log", "step": 5910, "epoch": 2.4936708860759493, "progress_pct": 41.56, "epoch_pct": 41.56, "eta": "39:51:34", "max_grad_norm": 0.8, "loss": 0.5159370303153992, "grad_norm": 1.1040663719177246, "learning_rate": 7.061871672285317e-05} +{"ts": "2025-12-27T22:29:57", "event": "train_log", "step": 5912, "epoch": 2.4945147679324893, "progress_pct": 41.58, "epoch_pct": 41.58, "eta": "39:50:29", "max_grad_norm": 0.8, "loss": 0.6161949634552002, "grad_norm": 1.3681589365005493, "learning_rate": 7.05968366781814e-05} +{"ts": "2025-12-27T22:30:09", "event": "train_log", "step": 5914, "epoch": 2.4953586497890297, "progress_pct": 41.59, "epoch_pct": 41.59, "eta": "39:49:23", "max_grad_norm": 0.8, "loss": 0.6357758641242981, "grad_norm": 1.26628839969635, "learning_rate": 7.057495188221498e-05} +{"ts": "2025-12-27T22:30:22", "event": "train_log", "step": 5916, "epoch": 2.4962025316455696, "progress_pct": 41.6, "epoch_pct": 41.6, "eta": "39:48:18", "max_grad_norm": 0.8, "loss": 0.5467366576194763, "grad_norm": 1.2714020013809204, "learning_rate": 7.05530623400023e-05} +{"ts": "2025-12-27T22:30:35", "event": "train_log", "step": 5918, "epoch": 2.4970464135021095, "progress_pct": 41.62, "epoch_pct": 41.62, "eta": "39:47:13", "max_grad_norm": 0.8, "loss": 0.592526376247406, "grad_norm": 1.2255018949508667, "learning_rate": 7.053116805659287e-05} +{"ts": "2025-12-27T22:30:47", "event": "train_log", "step": 5920, "epoch": 2.49789029535865, "progress_pct": 41.63, "epoch_pct": 41.63, "eta": "39:46:06", "max_grad_norm": 0.8, "loss": 0.5819981694221497, "grad_norm": 1.2816206216812134, "learning_rate": 7.050926903703729e-05} +{"ts": "2025-12-27T22:30:59", "event": "train_log", "step": 5922, "epoch": 2.49873417721519, "progress_pct": 41.65, "epoch_pct": 41.65, "eta": "39:45:01", "max_grad_norm": 0.8, "loss": 0.6037712693214417, "grad_norm": 1.1938221454620361, "learning_rate": 7.048736528638722e-05} +{"ts": "2025-12-27T22:31:12", "event": "train_log", "step": 5924, "epoch": 2.49957805907173, "progress_pct": 41.66, "epoch_pct": 41.66, "eta": "39:43:56", "max_grad_norm": 0.8, "loss": 0.5567215085029602, "grad_norm": 1.1330323219299316, "learning_rate": 7.046545680969545e-05} +{"ts": "2025-12-27T22:31:25", "event": "train_log", "step": 5926, "epoch": 2.50042194092827, "progress_pct": 41.67, "epoch_pct": 41.67, "eta": "39:42:51", "max_grad_norm": 0.8, "loss": 0.5626974105834961, "grad_norm": 1.233564019203186, "learning_rate": 7.044354361201585e-05} +{"ts": "2025-12-27T22:31:37", "event": "train_log", "step": 5928, "epoch": 2.50126582278481, "progress_pct": 41.69, "epoch_pct": 41.69, "eta": "39:41:46", "max_grad_norm": 0.8, "loss": 0.5672739744186401, "grad_norm": 1.1913540363311768, "learning_rate": 7.042162569840336e-05} +{"ts": "2025-12-27T22:31:50", "event": "train_log", "step": 5930, "epoch": 2.50210970464135, "progress_pct": 41.7, "epoch_pct": 41.7, "eta": "39:40:42", "max_grad_norm": 0.8, "loss": 0.5965602993965149, "grad_norm": 1.060952067375183, "learning_rate": 7.039970307391402e-05} +{"ts": "2025-12-27T22:32:03", "event": "train_log", "step": 5932, "epoch": 2.5029535864978905, "progress_pct": 41.72, "epoch_pct": 41.72, "eta": "39:39:37", "max_grad_norm": 0.8, "loss": 0.590932309627533, "grad_norm": 1.2003182172775269, "learning_rate": 7.037777574360497e-05} +{"ts": "2025-12-27T22:32:16", "event": "train_log", "step": 5934, "epoch": 2.5037974683544304, "progress_pct": 41.73, "epoch_pct": 41.73, "eta": "39:38:32", "max_grad_norm": 0.8, "loss": 0.5736868381500244, "grad_norm": 1.073434829711914, "learning_rate": 7.035584371253441e-05} +{"ts": "2025-12-27T22:32:28", "event": "train_log", "step": 5936, "epoch": 2.5046413502109703, "progress_pct": 41.74, "epoch_pct": 41.74, "eta": "39:37:27", "max_grad_norm": 0.8, "loss": 0.614703357219696, "grad_norm": 1.2641130685806274, "learning_rate": 7.033390698576166e-05} +{"ts": "2025-12-27T22:32:41", "event": "train_log", "step": 5938, "epoch": 2.5054852320675103, "progress_pct": 41.76, "epoch_pct": 41.76, "eta": "39:36:22", "max_grad_norm": 0.8, "loss": 0.5866397023200989, "grad_norm": 1.2406511306762695, "learning_rate": 7.031196556834708e-05} +{"ts": "2025-12-27T22:32:53", "event": "train_log", "step": 5940, "epoch": 2.5063291139240507, "progress_pct": 41.77, "epoch_pct": 41.77, "eta": "39:35:16", "max_grad_norm": 0.8, "loss": 0.5792667865753174, "grad_norm": 1.231619119644165, "learning_rate": 7.029001946535215e-05} +{"ts": "2025-12-27T22:33:05", "event": "train_log", "step": 5942, "epoch": 2.5071729957805906, "progress_pct": 41.79, "epoch_pct": 41.79, "eta": "39:34:11", "max_grad_norm": 0.8, "loss": 0.5686604976654053, "grad_norm": 1.419447660446167, "learning_rate": 7.026806868183939e-05} +{"ts": "2025-12-27T22:33:18", "event": "train_log", "step": 5944, "epoch": 2.5080168776371305, "progress_pct": 41.8, "epoch_pct": 41.8, "eta": "39:33:07", "max_grad_norm": 0.8, "loss": 0.5860661268234253, "grad_norm": 1.139244556427002, "learning_rate": 7.024611322287245e-05} +{"ts": "2025-12-27T22:33:32", "event": "train_log", "step": 5946, "epoch": 2.508860759493671, "progress_pct": 41.81, "epoch_pct": 41.81, "eta": "39:32:04", "max_grad_norm": 0.8, "loss": 0.5823250412940979, "grad_norm": 1.070517897605896, "learning_rate": 7.022415309351602e-05} +{"ts": "2025-12-27T22:33:46", "event": "train_log", "step": 5948, "epoch": 2.509704641350211, "progress_pct": 41.83, "epoch_pct": 41.83, "eta": "39:31:01", "max_grad_norm": 0.8, "loss": 0.5291389226913452, "grad_norm": 1.0775398015975952, "learning_rate": 7.020218829883589e-05} +{"ts": "2025-12-27T22:33:57", "event": "train_log", "step": 5950, "epoch": 2.510548523206751, "progress_pct": 41.84, "epoch_pct": 41.84, "eta": "39:29:54", "max_grad_norm": 0.8, "loss": 0.6215447783470154, "grad_norm": 1.339716911315918, "learning_rate": 7.018021884389892e-05} +{"ts": "2025-12-27T22:34:09", "event": "train_log", "step": 5952, "epoch": 2.511392405063291, "progress_pct": 41.86, "epoch_pct": 41.86, "eta": "39:28:49", "max_grad_norm": 0.8, "loss": 0.5419909358024597, "grad_norm": 1.3589707612991333, "learning_rate": 7.0158244733773e-05} +{"ts": "2025-12-27T22:34:22", "event": "train_log", "step": 5954, "epoch": 2.512236286919831, "progress_pct": 41.87, "epoch_pct": 41.87, "eta": "39:27:45", "max_grad_norm": 0.8, "loss": 0.5476977229118347, "grad_norm": 1.1664098501205444, "learning_rate": 7.01362659735272e-05} +{"ts": "2025-12-27T22:34:35", "event": "train_log", "step": 5956, "epoch": 2.513080168776371, "progress_pct": 41.88, "epoch_pct": 41.88, "eta": "39:26:41", "max_grad_norm": 0.8, "loss": 0.5896323919296265, "grad_norm": 1.1184223890304565, "learning_rate": 7.011428256823154e-05} +{"ts": "2025-12-27T22:34:47", "event": "train_log", "step": 5958, "epoch": 2.5139240506329115, "progress_pct": 41.9, "epoch_pct": 41.9, "eta": "39:25:35", "max_grad_norm": 0.8, "loss": 0.6353691220283508, "grad_norm": 1.4071170091629028, "learning_rate": 7.00922945229572e-05} +{"ts": "2025-12-27T22:34:58", "event": "train_log", "step": 5960, "epoch": 2.5147679324894514, "progress_pct": 41.91, "epoch_pct": 41.91, "eta": "39:24:29", "max_grad_norm": 0.8, "loss": 0.6605582237243652, "grad_norm": 1.3740885257720947, "learning_rate": 7.007030184277641e-05} +{"ts": "2025-12-27T22:35:11", "event": "train_log", "step": 5962, "epoch": 2.5156118143459913, "progress_pct": 41.93, "epoch_pct": 41.93, "eta": "39:23:25", "max_grad_norm": 0.8, "loss": 0.6399887800216675, "grad_norm": 1.071395754814148, "learning_rate": 7.004830453276241e-05} +{"ts": "2025-12-27T22:35:23", "event": "train_log", "step": 5964, "epoch": 2.5164556962025317, "progress_pct": 41.94, "epoch_pct": 41.94, "eta": "39:22:20", "max_grad_norm": 0.8, "loss": 0.5992775559425354, "grad_norm": 1.2292311191558838, "learning_rate": 7.002630259798962e-05} +{"ts": "2025-12-27T22:35:36", "event": "train_log", "step": 5966, "epoch": 2.5172995780590717, "progress_pct": 41.95, "epoch_pct": 41.95, "eta": "39:21:16", "max_grad_norm": 0.8, "loss": 0.5716721415519714, "grad_norm": 1.0133391618728638, "learning_rate": 7.000429604353341e-05} +{"ts": "2025-12-27T22:35:49", "event": "train_log", "step": 5968, "epoch": 2.5181434599156116, "progress_pct": 41.97, "epoch_pct": 41.97, "eta": "39:20:11", "max_grad_norm": 0.8, "loss": 0.5455520749092102, "grad_norm": 1.2669343948364258, "learning_rate": 6.998228487447032e-05} +{"ts": "2025-12-27T22:36:01", "event": "train_log", "step": 5970, "epoch": 2.518987341772152, "progress_pct": 41.98, "epoch_pct": 41.98, "eta": "39:19:06", "max_grad_norm": 0.8, "loss": 0.6411572694778442, "grad_norm": 1.2026386260986328, "learning_rate": 6.996026909587785e-05} +{"ts": "2025-12-27T22:36:12", "event": "train_log", "step": 5972, "epoch": 2.519831223628692, "progress_pct": 42.0, "epoch_pct": 42.0, "eta": "39:18:00", "max_grad_norm": 0.8, "loss": 0.6687750220298767, "grad_norm": 1.359923243522644, "learning_rate": 6.993824871283465e-05} +{"ts": "2025-12-27T22:36:24", "event": "train_log", "step": 5974, "epoch": 2.520675105485232, "progress_pct": 42.01, "epoch_pct": 42.01, "eta": "39:16:55", "max_grad_norm": 0.8, "loss": 0.6271382570266724, "grad_norm": 1.1265650987625122, "learning_rate": 6.99162237304204e-05} +{"ts": "2025-12-27T22:36:37", "event": "train_log", "step": 5976, "epoch": 2.5215189873417723, "progress_pct": 42.03, "epoch_pct": 42.03, "eta": "39:15:51", "max_grad_norm": 0.8, "loss": 0.6191279888153076, "grad_norm": 1.197667121887207, "learning_rate": 6.989419415371583e-05} +{"ts": "2025-12-27T22:36:49", "event": "train_log", "step": 5978, "epoch": 2.522362869198312, "progress_pct": 42.04, "epoch_pct": 42.04, "eta": "39:14:46", "max_grad_norm": 0.8, "loss": 0.6313687562942505, "grad_norm": 1.169992446899414, "learning_rate": 6.987215998780275e-05} +{"ts": "2025-12-27T22:37:02", "event": "train_log", "step": 5980, "epoch": 2.523206751054852, "progress_pct": 42.05, "epoch_pct": 42.05, "eta": "39:13:42", "max_grad_norm": 0.8, "loss": 0.6058336496353149, "grad_norm": 1.2706433534622192, "learning_rate": 6.9850121237764e-05} +{"ts": "2025-12-27T22:37:14", "event": "train_log", "step": 5982, "epoch": 2.5240506329113925, "progress_pct": 42.07, "epoch_pct": 42.07, "eta": "39:12:37", "max_grad_norm": 0.8, "loss": 0.6466464400291443, "grad_norm": 1.322376012802124, "learning_rate": 6.982807790868352e-05} +{"ts": "2025-12-27T22:37:26", "event": "train_log", "step": 5984, "epoch": 2.5248945147679325, "progress_pct": 42.08, "epoch_pct": 42.08, "eta": "39:11:33", "max_grad_norm": 0.8, "loss": 0.5730098485946655, "grad_norm": 1.2398571968078613, "learning_rate": 6.980603000564626e-05} +{"ts": "2025-12-27T22:37:39", "event": "train_log", "step": 5986, "epoch": 2.5257383966244724, "progress_pct": 42.1, "epoch_pct": 42.1, "eta": "39:10:30", "max_grad_norm": 0.8, "loss": 0.5305635333061218, "grad_norm": 1.2035216093063354, "learning_rate": 6.978397753373826e-05} +{"ts": "2025-12-27T22:37:52", "event": "train_log", "step": 5988, "epoch": 2.526582278481013, "progress_pct": 42.11, "epoch_pct": 42.11, "eta": "39:09:26", "max_grad_norm": 0.8, "loss": 0.5601096153259277, "grad_norm": 1.1951299905776978, "learning_rate": 6.976192049804661e-05} +{"ts": "2025-12-27T22:38:06", "event": "train_log", "step": 5990, "epoch": 2.5274261603375527, "progress_pct": 42.12, "epoch_pct": 42.12, "eta": "39:08:23", "max_grad_norm": 0.8, "loss": 0.5049516558647156, "grad_norm": 0.9950459599494934, "learning_rate": 6.973985890365945e-05} +{"ts": "2025-12-27T22:38:18", "event": "train_log", "step": 5992, "epoch": 2.5282700421940927, "progress_pct": 42.14, "epoch_pct": 42.14, "eta": "39:07:19", "max_grad_norm": 0.8, "loss": 0.5456960797309875, "grad_norm": 1.2581008672714233, "learning_rate": 6.971779275566593e-05} +{"ts": "2025-12-27T22:38:31", "event": "train_log", "step": 5994, "epoch": 2.529113924050633, "progress_pct": 42.15, "epoch_pct": 42.15, "eta": "39:06:15", "max_grad_norm": 0.8, "loss": 0.6026827096939087, "grad_norm": 1.2196903228759766, "learning_rate": 6.969572205915632e-05} +{"ts": "2025-12-27T22:38:44", "event": "train_log", "step": 5996, "epoch": 2.529957805907173, "progress_pct": 42.17, "epoch_pct": 42.17, "eta": "39:05:12", "max_grad_norm": 0.8, "loss": 0.597453236579895, "grad_norm": 1.3109357357025146, "learning_rate": 6.967364681922189e-05} +{"ts": "2025-12-27T22:38:57", "event": "train_log", "step": 5998, "epoch": 2.530801687763713, "progress_pct": 42.18, "epoch_pct": 42.18, "eta": "39:04:09", "max_grad_norm": 0.8, "loss": 0.5304323434829712, "grad_norm": 1.016904354095459, "learning_rate": 6.965156704095498e-05} +{"ts": "2025-12-27T22:39:10", "event": "train_log", "step": 6000, "epoch": 2.5316455696202533, "progress_pct": 42.19, "epoch_pct": 42.19, "eta": "39:03:05", "max_grad_norm": 0.8, "loss": 0.5748253464698792, "grad_norm": 1.2363858222961426, "learning_rate": 6.962948272944896e-05} +{"ts": "2025-12-27T22:47:43", "event": "train_log", "step": 6000, "epoch": 2.5316455696202533, "progress_pct": 42.19, "epoch_pct": 42.19, "eta": "39:14:49", "max_grad_norm": 0.8, "eval_loss": 0.6813357472419739, "eval_runtime": 513.5491, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103} +{"ts": "2025-12-27T22:47:56", "event": "train_log", "step": 6002, "epoch": 2.5324894514767933, "progress_pct": 42.21, "epoch_pct": 42.21, "eta": "39:13:45", "max_grad_norm": 0.8, "loss": 0.613327145576477, "grad_norm": 1.1766576766967773, "learning_rate": 6.960739388979827e-05} +{"ts": "2025-12-27T22:48:08", "event": "train_log", "step": 6004, "epoch": 2.533333333333333, "progress_pct": 42.22, "epoch_pct": 42.22, "eta": "39:12:39", "max_grad_norm": 0.8, "loss": 0.6648217439651489, "grad_norm": 1.4065337181091309, "learning_rate": 6.95853005270984e-05} +{"ts": "2025-12-27T22:48:21", "event": "train_log", "step": 6006, "epoch": 2.5341772151898736, "progress_pct": 42.24, "epoch_pct": 42.24, "eta": "39:11:37", "max_grad_norm": 0.8, "loss": 0.5165349841117859, "grad_norm": 0.9513862133026123, "learning_rate": 6.956320264644582e-05} +{"ts": "2025-12-27T22:48:34", "event": "train_log", "step": 6008, "epoch": 2.5350210970464135, "progress_pct": 42.25, "epoch_pct": 42.25, "eta": "39:10:32", "max_grad_norm": 0.8, "loss": 0.5594159364700317, "grad_norm": 1.1104962825775146, "learning_rate": 6.95411002529381e-05} +{"ts": "2025-12-27T22:48:47", "event": "train_log", "step": 6010, "epoch": 2.5358649789029535, "progress_pct": 42.26, "epoch_pct": 42.26, "eta": "39:09:29", "max_grad_norm": 0.8, "loss": 0.5662833452224731, "grad_norm": 1.1698877811431885, "learning_rate": 6.951899335167386e-05} +{"ts": "2025-12-27T22:48:59", "event": "train_log", "step": 6012, "epoch": 2.536708860759494, "progress_pct": 42.28, "epoch_pct": 42.28, "eta": "39:08:25", "max_grad_norm": 0.8, "loss": 0.5780806541442871, "grad_norm": 1.2051950693130493, "learning_rate": 6.949688194775272e-05} +{"ts": "2025-12-27T22:49:12", "event": "train_log", "step": 6014, "epoch": 2.537552742616034, "progress_pct": 42.29, "epoch_pct": 42.29, "eta": "39:07:20", "max_grad_norm": 0.8, "loss": 0.6112543344497681, "grad_norm": 1.2434250116348267, "learning_rate": 6.947476604627536e-05} +{"ts": "2025-12-27T22:49:25", "event": "train_log", "step": 6016, "epoch": 2.5383966244725737, "progress_pct": 42.31, "epoch_pct": 42.31, "eta": "39:06:18", "max_grad_norm": 0.8, "loss": 0.5556519031524658, "grad_norm": 1.1473076343536377, "learning_rate": 6.945264565234348e-05} +{"ts": "2025-12-27T22:49:37", "event": "train_log", "step": 6018, "epoch": 2.539240506329114, "progress_pct": 42.32, "epoch_pct": 42.32, "eta": "39:05:13", "max_grad_norm": 0.8, "loss": 0.6664283275604248, "grad_norm": 1.3139631748199463, "learning_rate": 6.943052077105987e-05} +{"ts": "2025-12-27T22:49:49", "event": "train_log", "step": 6020, "epoch": 2.540084388185654, "progress_pct": 42.33, "epoch_pct": 42.33, "eta": "39:04:08", "max_grad_norm": 0.8, "loss": 0.6358945369720459, "grad_norm": 1.3407402038574219, "learning_rate": 6.940839140752825e-05} +{"ts": "2025-12-27T22:50:01", "event": "train_log", "step": 6022, "epoch": 2.540928270042194, "progress_pct": 42.35, "epoch_pct": 42.35, "eta": "39:03:04", "max_grad_norm": 0.8, "loss": 0.6310063600540161, "grad_norm": 1.2223491668701172, "learning_rate": 6.938625756685352e-05} +{"ts": "2025-12-27T22:50:14", "event": "train_log", "step": 6024, "epoch": 2.5417721518987344, "progress_pct": 42.36, "epoch_pct": 42.36, "eta": "39:02:00", "max_grad_norm": 0.8, "loss": 0.6090726256370544, "grad_norm": 1.3984094858169556, "learning_rate": 6.936411925414146e-05} +{"ts": "2025-12-27T22:50:27", "event": "train_log", "step": 6026, "epoch": 2.5426160337552743, "progress_pct": 42.38, "epoch_pct": 42.38, "eta": "39:00:56", "max_grad_norm": 0.8, "loss": 0.585586428642273, "grad_norm": 1.1876440048217773, "learning_rate": 6.9341976474499e-05} +{"ts": "2025-12-27T22:50:39", "event": "train_log", "step": 6028, "epoch": 2.5434599156118143, "progress_pct": 42.39, "epoch_pct": 42.39, "eta": "38:59:52", "max_grad_norm": 0.8, "loss": 0.6382114887237549, "grad_norm": 1.2213155031204224, "learning_rate": 6.931982923303402e-05} +{"ts": "2025-12-27T22:50:51", "event": "train_log", "step": 6030, "epoch": 2.5443037974683547, "progress_pct": 42.41, "epoch_pct": 42.41, "eta": "38:58:48", "max_grad_norm": 0.8, "loss": 0.5851555466651917, "grad_norm": 1.0637959241867065, "learning_rate": 6.92976775348555e-05} +{"ts": "2025-12-27T22:51:04", "event": "train_log", "step": 6032, "epoch": 2.5451476793248946, "progress_pct": 42.42, "epoch_pct": 42.42, "eta": "38:57:44", "max_grad_norm": 0.8, "loss": 0.5867910385131836, "grad_norm": 1.150227665901184, "learning_rate": 6.927552138507337e-05} +{"ts": "2025-12-27T22:51:16", "event": "train_log", "step": 6034, "epoch": 2.5459915611814345, "progress_pct": 42.43, "epoch_pct": 42.43, "eta": "38:56:40", "max_grad_norm": 0.8, "loss": 0.5876969695091248, "grad_norm": 1.1405255794525146, "learning_rate": 6.925336078879865e-05} +{"ts": "2025-12-27T22:51:29", "event": "train_log", "step": 6036, "epoch": 2.546835443037975, "progress_pct": 42.45, "epoch_pct": 42.45, "eta": "38:55:38", "max_grad_norm": 0.8, "loss": 0.626306414604187, "grad_norm": 1.0269757509231567, "learning_rate": 6.923119575114339e-05} +{"ts": "2025-12-27T22:51:42", "event": "train_log", "step": 6038, "epoch": 2.547679324894515, "progress_pct": 42.46, "epoch_pct": 42.46, "eta": "38:54:34", "max_grad_norm": 0.8, "loss": 0.645074188709259, "grad_norm": 1.1978809833526611, "learning_rate": 6.920902627722059e-05} +{"ts": "2025-12-27T22:51:54", "event": "train_log", "step": 6040, "epoch": 2.548523206751055, "progress_pct": 42.48, "epoch_pct": 42.48, "eta": "38:53:30", "max_grad_norm": 0.8, "loss": 0.6284276247024536, "grad_norm": 1.1684149503707886, "learning_rate": 6.918685237214435e-05} +{"ts": "2025-12-27T22:52:07", "event": "train_log", "step": 6042, "epoch": 2.549367088607595, "progress_pct": 42.49, "epoch_pct": 42.49, "eta": "38:52:27", "max_grad_norm": 0.8, "loss": 0.5770997405052185, "grad_norm": 1.2538992166519165, "learning_rate": 6.916467404102977e-05} +{"ts": "2025-12-27T22:52:20", "event": "train_log", "step": 6044, "epoch": 2.550210970464135, "progress_pct": 42.5, "epoch_pct": 42.5, "eta": "38:51:24", "max_grad_norm": 0.8, "loss": 0.5501131415367126, "grad_norm": 1.2381856441497803, "learning_rate": 6.914249128899294e-05} +{"ts": "2025-12-27T22:52:34", "event": "train_log", "step": 6046, "epoch": 2.551054852320675, "progress_pct": 42.52, "epoch_pct": 42.52, "eta": "38:50:22", "max_grad_norm": 0.8, "loss": 0.5362627506256104, "grad_norm": 1.0487099885940552, "learning_rate": 6.912030412115101e-05} +{"ts": "2025-12-27T22:52:46", "event": "train_log", "step": 6048, "epoch": 2.5518987341772155, "progress_pct": 42.53, "epoch_pct": 42.53, "eta": "38:49:17", "max_grad_norm": 0.8, "loss": 0.6694624423980713, "grad_norm": 1.3471804857254028, "learning_rate": 6.909811254262213e-05} +{"ts": "2025-12-27T22:52:57", "event": "train_log", "step": 6050, "epoch": 2.5527426160337554, "progress_pct": 42.55, "epoch_pct": 42.55, "eta": "38:48:12", "max_grad_norm": 0.8, "loss": 0.642368733882904, "grad_norm": 1.4262096881866455, "learning_rate": 6.907591655852547e-05} +{"ts": "2025-12-27T22:53:10", "event": "train_log", "step": 6052, "epoch": 2.5535864978902953, "progress_pct": 42.56, "epoch_pct": 42.56, "eta": "38:47:09", "max_grad_norm": 0.8, "loss": 0.6266166567802429, "grad_norm": 1.171004295349121, "learning_rate": 6.905371617398122e-05} +{"ts": "2025-12-27T22:53:23", "event": "train_log", "step": 6054, "epoch": 2.5544303797468353, "progress_pct": 42.57, "epoch_pct": 42.57, "eta": "38:46:07", "max_grad_norm": 0.8, "loss": 0.5518985986709595, "grad_norm": 1.1249992847442627, "learning_rate": 6.90315113941106e-05} +{"ts": "2025-12-27T22:53:36", "event": "train_log", "step": 6056, "epoch": 2.5552742616033757, "progress_pct": 42.59, "epoch_pct": 42.59, "eta": "38:45:03", "max_grad_norm": 0.8, "loss": 0.5367884039878845, "grad_norm": 1.3049964904785156, "learning_rate": 6.900930222403579e-05} +{"ts": "2025-12-27T22:53:48", "event": "train_log", "step": 6058, "epoch": 2.5561181434599156, "progress_pct": 42.6, "epoch_pct": 42.6, "eta": "38:44:01", "max_grad_norm": 0.8, "loss": 0.6057673096656799, "grad_norm": 1.3548237085342407, "learning_rate": 6.898708866888005e-05} +{"ts": "2025-12-27T22:54:01", "event": "train_log", "step": 6060, "epoch": 2.5569620253164556, "progress_pct": 42.62, "epoch_pct": 42.62, "eta": "38:42:57", "max_grad_norm": 0.8, "loss": 0.5493726134300232, "grad_norm": 1.1422157287597656, "learning_rate": 6.89648707337676e-05} +{"ts": "2025-12-27T22:54:14", "event": "train_log", "step": 6062, "epoch": 2.557805907172996, "progress_pct": 42.63, "epoch_pct": 42.63, "eta": "38:41:55", "max_grad_norm": 0.8, "loss": 0.5055251717567444, "grad_norm": 1.0179574489593506, "learning_rate": 6.89426484238237e-05} +{"ts": "2025-12-27T22:54:27", "event": "train_log", "step": 6064, "epoch": 2.558649789029536, "progress_pct": 42.64, "epoch_pct": 42.64, "eta": "38:40:52", "max_grad_norm": 0.8, "loss": 0.6099714040756226, "grad_norm": 1.2062081098556519, "learning_rate": 6.89204217441746e-05} +{"ts": "2025-12-27T22:54:39", "event": "train_log", "step": 6066, "epoch": 2.559493670886076, "progress_pct": 42.66, "epoch_pct": 42.66, "eta": "38:39:48", "max_grad_norm": 0.8, "loss": 0.6432347893714905, "grad_norm": 1.3043999671936035, "learning_rate": 6.889819069994759e-05} +{"ts": "2025-12-27T22:54:52", "event": "train_log", "step": 6068, "epoch": 2.5603375527426158, "progress_pct": 42.67, "epoch_pct": 42.67, "eta": "38:38:45", "max_grad_norm": 0.8, "loss": 0.6052974462509155, "grad_norm": 1.241347074508667, "learning_rate": 6.887595529627093e-05} +{"ts": "2025-12-27T22:55:04", "event": "train_log", "step": 6070, "epoch": 2.561181434599156, "progress_pct": 42.69, "epoch_pct": 42.69, "eta": "38:37:41", "max_grad_norm": 0.8, "loss": 0.6239711046218872, "grad_norm": 1.2502845525741577, "learning_rate": 6.88537155382739e-05} +{"ts": "2025-12-27T22:55:16", "event": "train_log", "step": 6072, "epoch": 2.562025316455696, "progress_pct": 42.7, "epoch_pct": 42.7, "eta": "38:36:38", "max_grad_norm": 0.8, "loss": 0.5462124347686768, "grad_norm": 1.0815852880477905, "learning_rate": 6.883147143108679e-05} +{"ts": "2025-12-27T22:55:29", "event": "train_log", "step": 6074, "epoch": 2.562869198312236, "progress_pct": 42.71, "epoch_pct": 42.71, "eta": "38:35:35", "max_grad_norm": 0.8, "loss": 0.5727240443229675, "grad_norm": 1.1990602016448975, "learning_rate": 6.880922297984087e-05} +{"ts": "2025-12-27T22:55:41", "event": "train_log", "step": 6076, "epoch": 2.5637130801687764, "progress_pct": 42.73, "epoch_pct": 42.73, "eta": "38:34:32", "max_grad_norm": 0.8, "loss": 0.5160089731216431, "grad_norm": 1.016781210899353, "learning_rate": 6.878697018966846e-05} +{"ts": "2025-12-27T22:55:54", "event": "train_log", "step": 6078, "epoch": 2.5645569620253164, "progress_pct": 42.74, "epoch_pct": 42.74, "eta": "38:33:29", "max_grad_norm": 0.8, "loss": 0.6344075798988342, "grad_norm": 1.1946886777877808, "learning_rate": 6.876471306570286e-05} +{"ts": "2025-12-27T22:56:07", "event": "train_log", "step": 6080, "epoch": 2.5654008438818563, "progress_pct": 42.76, "epoch_pct": 42.76, "eta": "38:32:27", "max_grad_norm": 0.8, "loss": 0.6142247319221497, "grad_norm": 1.1460139751434326, "learning_rate": 6.87424516130783e-05} +{"ts": "2025-12-27T22:56:20", "event": "train_log", "step": 6082, "epoch": 2.5662447257383967, "progress_pct": 42.77, "epoch_pct": 42.77, "eta": "38:31:24", "max_grad_norm": 0.8, "loss": 0.6330769658088684, "grad_norm": 1.3636937141418457, "learning_rate": 6.872018583693013e-05} +{"ts": "2025-12-27T22:56:32", "event": "train_log", "step": 6084, "epoch": 2.5670886075949366, "progress_pct": 42.78, "epoch_pct": 42.78, "eta": "38:30:21", "max_grad_norm": 0.8, "loss": 0.6386255621910095, "grad_norm": 1.3545513153076172, "learning_rate": 6.869791574239463e-05} +{"ts": "2025-12-27T22:56:45", "event": "train_log", "step": 6086, "epoch": 2.5679324894514766, "progress_pct": 42.8, "epoch_pct": 42.8, "eta": "38:29:19", "max_grad_norm": 0.8, "loss": 0.5527385473251343, "grad_norm": 1.1196715831756592, "learning_rate": 6.867564133460904e-05} +{"ts": "2025-12-27T22:56:58", "event": "train_log", "step": 6088, "epoch": 2.568776371308017, "progress_pct": 42.81, "epoch_pct": 42.81, "eta": "38:28:17", "max_grad_norm": 0.8, "loss": 0.5689145922660828, "grad_norm": 1.0583977699279785, "learning_rate": 6.865336261871168e-05} +{"ts": "2025-12-27T22:57:11", "event": "train_log", "step": 6090, "epoch": 2.569620253164557, "progress_pct": 42.83, "epoch_pct": 42.83, "eta": "38:27:15", "max_grad_norm": 0.8, "loss": 0.5756540298461914, "grad_norm": 1.2963348627090454, "learning_rate": 6.86310795998418e-05} +{"ts": "2025-12-27T22:57:23", "event": "train_log", "step": 6092, "epoch": 2.570464135021097, "progress_pct": 42.84, "epoch_pct": 42.84, "eta": "38:26:11", "max_grad_norm": 0.8, "loss": 0.6062834858894348, "grad_norm": 1.122214436531067, "learning_rate": 6.860879228313968e-05} +{"ts": "2025-12-27T22:57:36", "event": "train_log", "step": 6094, "epoch": 2.571308016877637, "progress_pct": 42.86, "epoch_pct": 42.86, "eta": "38:25:09", "max_grad_norm": 0.8, "loss": 0.5526617169380188, "grad_norm": 1.1313230991363525, "learning_rate": 6.858650067374657e-05} +{"ts": "2025-12-27T22:57:49", "event": "train_log", "step": 6096, "epoch": 2.572151898734177, "progress_pct": 42.87, "epoch_pct": 42.87, "eta": "38:24:06", "max_grad_norm": 0.8, "loss": 0.5911332964897156, "grad_norm": 1.6992650032043457, "learning_rate": 6.856420477680471e-05} +{"ts": "2025-12-27T22:58:01", "event": "train_log", "step": 6098, "epoch": 2.572995780590717, "progress_pct": 42.88, "epoch_pct": 42.88, "eta": "38:23:04", "max_grad_norm": 0.8, "loss": 0.5730270743370056, "grad_norm": 1.2622860670089722, "learning_rate": 6.854190459745735e-05} +{"ts": "2025-12-27T22:58:13", "event": "train_log", "step": 6100, "epoch": 2.5738396624472575, "progress_pct": 42.9, "epoch_pct": 42.9, "eta": "38:22:00", "max_grad_norm": 0.8, "loss": 0.597838282585144, "grad_norm": 1.1420512199401855, "learning_rate": 6.851960014084868e-05} +{"ts": "2025-12-27T23:06:47", "event": "train_log", "step": 6100, "epoch": 2.5738396624472575, "progress_pct": 42.9, "epoch_pct": 42.9, "eta": "38:33:24", "max_grad_norm": 0.8, "eval_loss": 0.6812278628349304, "eval_runtime": 513.4749, "eval_samples_per_second": 4.103, "eval_steps_per_second": 4.103} +{"ts": "2025-12-27T23:07:00", "event": "train_log", "step": 6102, "epoch": 2.5746835443037974, "progress_pct": 42.91, "epoch_pct": 42.91, "eta": "38:32:21", "max_grad_norm": 0.8, "loss": 0.6048991084098816, "grad_norm": 1.129335641860962, "learning_rate": 6.849729141212396e-05} +{"ts": "2025-12-27T23:07:12", "event": "train_log", "step": 6104, "epoch": 2.5755274261603374, "progress_pct": 42.93, "epoch_pct": 42.93, "eta": "38:31:19", "max_grad_norm": 0.8, "loss": 0.6359057426452637, "grad_norm": 1.161284327507019, "learning_rate": 6.847497841642935e-05} +{"ts": "2025-12-27T23:07:25", "event": "train_log", "step": 6106, "epoch": 2.5763713080168777, "progress_pct": 42.94, "epoch_pct": 42.94, "eta": "38:30:16", "max_grad_norm": 0.8, "loss": 0.5858902335166931, "grad_norm": 1.285344123840332, "learning_rate": 6.845266115891203e-05} +{"ts": "2025-12-27T23:07:38", "event": "train_log", "step": 6108, "epoch": 2.5772151898734177, "progress_pct": 42.95, "epoch_pct": 42.95, "eta": "38:29:13", "max_grad_norm": 0.8, "loss": 0.5742247700691223, "grad_norm": 1.085143804550171, "learning_rate": 6.843033964472018e-05} +{"ts": "2025-12-27T23:07:51", "event": "train_log", "step": 6110, "epoch": 2.5780590717299576, "progress_pct": 42.97, "epoch_pct": 42.97, "eta": "38:28:11", "max_grad_norm": 0.8, "loss": 0.6738532185554504, "grad_norm": 1.1920831203460693, "learning_rate": 6.840801387900291e-05} +{"ts": "2025-12-27T23:08:03", "event": "train_log", "step": 6112, "epoch": 2.578902953586498, "progress_pct": 42.98, "epoch_pct": 42.98, "eta": "38:27:07", "max_grad_norm": 0.8, "loss": 0.6046389937400818, "grad_norm": 1.2750232219696045, "learning_rate": 6.838568386691042e-05} +{"ts": "2025-12-27T23:08:15", "event": "train_log", "step": 6114, "epoch": 2.579746835443038, "progress_pct": 43.0, "epoch_pct": 43.0, "eta": "38:26:05", "max_grad_norm": 0.8, "loss": 0.6231611967086792, "grad_norm": 1.1027764081954956, "learning_rate": 6.836334961359373e-05} +{"ts": "2025-12-27T23:08:28", "event": "train_log", "step": 6116, "epoch": 2.580590717299578, "progress_pct": 43.01, "epoch_pct": 43.01, "eta": "38:25:02", "max_grad_norm": 0.8, "loss": 0.5848191380500793, "grad_norm": 1.2996546030044556, "learning_rate": 6.834101112420497e-05} +{"ts": "2025-12-27T23:08:41", "event": "train_log", "step": 6118, "epoch": 2.5814345991561183, "progress_pct": 43.02, "epoch_pct": 43.02, "eta": "38:24:00", "max_grad_norm": 0.8, "loss": 0.6160622835159302, "grad_norm": 1.2683454751968384, "learning_rate": 6.831866840389719e-05} +{"ts": "2025-12-27T23:08:53", "event": "train_log", "step": 6120, "epoch": 2.5822784810126582, "progress_pct": 43.04, "epoch_pct": 43.04, "eta": "38:22:57", "max_grad_norm": 0.8, "loss": 0.5220097899436951, "grad_norm": 1.049797534942627, "learning_rate": 6.829632145782441e-05} +{"ts": "2025-12-27T23:09:06", "event": "train_log", "step": 6122, "epoch": 2.583122362869198, "progress_pct": 43.05, "epoch_pct": 43.05, "eta": "38:21:55", "max_grad_norm": 0.8, "loss": 0.5709835290908813, "grad_norm": 1.1798468828201294, "learning_rate": 6.827397029114168e-05} +{"ts": "2025-12-27T23:09:20", "event": "train_log", "step": 6124, "epoch": 2.5839662447257385, "progress_pct": 43.07, "epoch_pct": 43.07, "eta": "38:20:54", "max_grad_norm": 0.8, "loss": 0.5086703300476074, "grad_norm": 1.0136369466781616, "learning_rate": 6.825161490900495e-05} +{"ts": "2025-12-27T23:09:33", "event": "train_log", "step": 6126, "epoch": 2.5848101265822785, "progress_pct": 43.08, "epoch_pct": 43.08, "eta": "38:19:51", "max_grad_norm": 0.8, "loss": 0.5904423594474792, "grad_norm": 1.147735595703125, "learning_rate": 6.822925531657119e-05} +{"ts": "2025-12-27T23:09:46", "event": "train_log", "step": 6128, "epoch": 2.5856540084388184, "progress_pct": 43.09, "epoch_pct": 43.09, "eta": "38:18:50", "max_grad_norm": 0.8, "loss": 0.5002011060714722, "grad_norm": 0.9979357123374939, "learning_rate": 6.820689151899833e-05} +{"ts": "2025-12-27T23:09:58", "event": "train_log", "step": 6130, "epoch": 2.586497890295359, "progress_pct": 43.11, "epoch_pct": 43.11, "eta": "38:17:47", "max_grad_norm": 0.8, "loss": 0.5694814920425415, "grad_norm": 1.4129728078842163, "learning_rate": 6.818452352144527e-05} +{"ts": "2025-12-27T23:10:11", "event": "train_log", "step": 6132, "epoch": 2.5873417721518988, "progress_pct": 43.12, "epoch_pct": 43.12, "eta": "38:16:45", "max_grad_norm": 0.8, "loss": 0.5448270440101624, "grad_norm": 1.1388975381851196, "learning_rate": 6.816215132907186e-05} +{"ts": "2025-12-27T23:10:24", "event": "train_log", "step": 6134, "epoch": 2.5881856540084387, "progress_pct": 43.14, "epoch_pct": 43.14, "eta": "38:15:43", "max_grad_norm": 0.8, "loss": 0.6184739470481873, "grad_norm": 1.268865942955017, "learning_rate": 6.813977494703896e-05} +{"ts": "2025-12-27T23:10:35", "event": "train_log", "step": 6136, "epoch": 2.589029535864979, "progress_pct": 43.15, "epoch_pct": 43.15, "eta": "38:14:39", "max_grad_norm": 0.8, "loss": 0.6493034958839417, "grad_norm": 1.2403846979141235, "learning_rate": 6.811739438050835e-05} +{"ts": "2025-12-27T23:10:48", "event": "train_log", "step": 6138, "epoch": 2.589873417721519, "progress_pct": 43.16, "epoch_pct": 43.16, "eta": "38:13:37", "max_grad_norm": 0.8, "loss": 0.6168854236602783, "grad_norm": 1.108298659324646, "learning_rate": 6.809500963464282e-05} +{"ts": "2025-12-27T23:11:01", "event": "train_log", "step": 6140, "epoch": 2.590717299578059, "progress_pct": 43.18, "epoch_pct": 43.18, "eta": "38:12:35", "max_grad_norm": 0.8, "loss": 0.5734958052635193, "grad_norm": 1.106427788734436, "learning_rate": 6.807262071460609e-05} +{"ts": "2025-12-27T23:11:15", "event": "train_log", "step": 6142, "epoch": 2.5915611814345993, "progress_pct": 43.19, "epoch_pct": 43.19, "eta": "38:11:34", "max_grad_norm": 0.8, "loss": 0.5422238111495972, "grad_norm": 1.147791862487793, "learning_rate": 6.805022762556286e-05} +{"ts": "2025-12-27T23:11:28", "event": "train_log", "step": 6144, "epoch": 2.5924050632911393, "progress_pct": 43.21, "epoch_pct": 43.21, "eta": "38:10:33", "max_grad_norm": 0.8, "loss": 0.6511701345443726, "grad_norm": 1.214465856552124, "learning_rate": 6.802783037267874e-05} +{"ts": "2025-12-27T23:11:41", "event": "train_log", "step": 6146, "epoch": 2.5932489451476792, "progress_pct": 43.22, "epoch_pct": 43.22, "eta": "38:09:31", "max_grad_norm": 0.8, "loss": 0.5978493094444275, "grad_norm": 1.087735891342163, "learning_rate": 6.800542896112043e-05} +{"ts": "2025-12-27T23:11:54", "event": "train_log", "step": 6148, "epoch": 2.5940928270042196, "progress_pct": 43.23, "epoch_pct": 43.23, "eta": "38:08:29", "max_grad_norm": 0.8, "loss": 0.5656765699386597, "grad_norm": 1.0772241353988647, "learning_rate": 6.798302339605544e-05} +{"ts": "2025-12-27T23:12:07", "event": "train_log", "step": 6150, "epoch": 2.5949367088607596, "progress_pct": 43.25, "epoch_pct": 43.25, "eta": "38:07:28", "max_grad_norm": 0.8, "loss": 0.6147777438163757, "grad_norm": 1.1666499376296997, "learning_rate": 6.796061368265231e-05} +{"ts": "2025-12-27T23:12:20", "event": "train_log", "step": 6152, "epoch": 2.5957805907172995, "progress_pct": 43.26, "epoch_pct": 43.26, "eta": "38:06:27", "max_grad_norm": 0.8, "loss": 0.502659022808075, "grad_norm": 0.9949467182159424, "learning_rate": 6.793819982608057e-05} +{"ts": "2025-12-27T23:12:32", "event": "train_log", "step": 6154, "epoch": 2.59662447257384, "progress_pct": 43.28, "epoch_pct": 43.28, "eta": "38:05:23", "max_grad_norm": 0.8, "loss": 0.6019812226295471, "grad_norm": 1.311484456062317, "learning_rate": 6.791578183151061e-05} +{"ts": "2025-12-27T23:12:45", "event": "train_log", "step": 6156, "epoch": 2.59746835443038, "progress_pct": 43.29, "epoch_pct": 43.29, "eta": "38:04:22", "max_grad_norm": 0.8, "loss": 0.625690221786499, "grad_norm": 0.9594855904579163, "learning_rate": 6.789335970411387e-05} +{"ts": "2025-12-27T23:12:57", "event": "train_log", "step": 6158, "epoch": 2.5983122362869198, "progress_pct": 43.31, "epoch_pct": 43.31, "eta": "38:03:19", "max_grad_norm": 0.8, "loss": 0.628356397151947, "grad_norm": 1.2252063751220703, "learning_rate": 6.78709334490627e-05} +{"ts": "2025-12-27T23:13:10", "event": "train_log", "step": 6160, "epoch": 2.59915611814346, "progress_pct": 43.32, "epoch_pct": 43.32, "eta": "38:02:18", "max_grad_norm": 0.8, "loss": 0.5447192192077637, "grad_norm": 1.089603304862976, "learning_rate": 6.784850307153043e-05} +{"ts": "2025-12-27T23:13:23", "event": "train_log", "step": 6162, "epoch": 2.6, "progress_pct": 43.33, "epoch_pct": 43.33, "eta": "38:01:16", "max_grad_norm": 0.8, "loss": 0.5400487184524536, "grad_norm": 1.1035163402557373, "learning_rate": 6.782606857669125e-05} +{"ts": "2025-12-27T23:13:36", "event": "train_log", "step": 6164, "epoch": 2.60084388185654, "progress_pct": 43.35, "epoch_pct": 43.35, "eta": "38:00:15", "max_grad_norm": 0.8, "loss": 0.5795643329620361, "grad_norm": 1.2329976558685303, "learning_rate": 6.780362996972042e-05} +{"ts": "2025-12-27T23:13:48", "event": "train_log", "step": 6166, "epoch": 2.6016877637130804, "progress_pct": 43.36, "epoch_pct": 43.36, "eta": "37:59:13", "max_grad_norm": 0.8, "loss": 0.5664985775947571, "grad_norm": 1.2984000444412231, "learning_rate": 6.778118725579408e-05} +{"ts": "2025-12-27T23:14:01", "event": "train_log", "step": 6168, "epoch": 2.6025316455696204, "progress_pct": 43.38, "epoch_pct": 43.38, "eta": "37:58:11", "max_grad_norm": 0.8, "loss": 0.5406283140182495, "grad_norm": 1.3563600778579712, "learning_rate": 6.775874044008933e-05} +{"ts": "2025-12-27T23:14:13", "event": "train_log", "step": 6170, "epoch": 2.6033755274261603, "progress_pct": 43.39, "epoch_pct": 43.39, "eta": "37:57:08", "max_grad_norm": 0.8, "loss": 0.5362374782562256, "grad_norm": 1.1897385120391846, "learning_rate": 6.773628952778421e-05} +{"ts": "2025-12-27T23:14:26", "event": "train_log", "step": 6172, "epoch": 2.6042194092827007, "progress_pct": 43.4, "epoch_pct": 43.4, "eta": "37:56:07", "max_grad_norm": 0.8, "loss": 0.5942689180374146, "grad_norm": 1.1492685079574585, "learning_rate": 6.771383452405773e-05} +{"ts": "2025-12-27T23:14:38", "event": "train_log", "step": 6174, "epoch": 2.6050632911392406, "progress_pct": 43.42, "epoch_pct": 43.42, "eta": "37:55:04", "max_grad_norm": 0.8, "loss": 0.6144227981567383, "grad_norm": 1.2306408882141113, "learning_rate": 6.769137543408985e-05} +{"ts": "2025-12-27T23:14:51", "event": "train_log", "step": 6176, "epoch": 2.6059071729957806, "progress_pct": 43.43, "epoch_pct": 43.43, "eta": "37:54:04", "max_grad_norm": 0.8, "loss": 0.5147640705108643, "grad_norm": 1.1260589361190796, "learning_rate": 6.766891226306143e-05} +{"ts": "2025-12-27T23:15:03", "event": "train_log", "step": 6178, "epoch": 2.606751054852321, "progress_pct": 43.45, "epoch_pct": 43.45, "eta": "37:53:01", "max_grad_norm": 0.8, "loss": 0.6822091341018677, "grad_norm": 1.214007019996643, "learning_rate": 6.764644501615427e-05} +{"ts": "2025-12-27T23:15:16", "event": "train_log", "step": 6180, "epoch": 2.607594936708861, "progress_pct": 43.46, "epoch_pct": 43.46, "eta": "37:52:00", "max_grad_norm": 0.8, "loss": 0.5330857038497925, "grad_norm": 1.2251341342926025, "learning_rate": 6.762397369855116e-05} +{"ts": "2025-12-27T23:15:29", "event": "train_log", "step": 6182, "epoch": 2.608438818565401, "progress_pct": 43.47, "epoch_pct": 43.47, "eta": "37:50:59", "max_grad_norm": 0.8, "loss": 0.58979332447052, "grad_norm": 1.3556525707244873, "learning_rate": 6.760149831543578e-05} +{"ts": "2025-12-27T23:15:42", "event": "train_log", "step": 6184, "epoch": 2.6092827004219408, "progress_pct": 43.49, "epoch_pct": 43.49, "eta": "37:49:57", "max_grad_norm": 0.8, "loss": 0.5667334198951721, "grad_norm": 1.286598563194275, "learning_rate": 6.757901887199278e-05} +{"ts": "2025-12-27T23:15:55", "event": "train_log", "step": 6186, "epoch": 2.610126582278481, "progress_pct": 43.5, "epoch_pct": 43.5, "eta": "37:48:57", "max_grad_norm": 0.8, "loss": 0.6028750538825989, "grad_norm": 1.2515888214111328, "learning_rate": 6.755653537340776e-05} +{"ts": "2025-12-27T23:16:08", "event": "train_log", "step": 6188, "epoch": 2.610970464135021, "progress_pct": 43.52, "epoch_pct": 43.52, "eta": "37:47:56", "max_grad_norm": 0.8, "loss": 0.604102611541748, "grad_norm": 1.1090617179870605, "learning_rate": 6.753404782486719e-05} +{"ts": "2025-12-27T23:16:21", "event": "train_log", "step": 6190, "epoch": 2.611814345991561, "progress_pct": 43.53, "epoch_pct": 43.53, "eta": "37:46:54", "max_grad_norm": 0.8, "loss": 0.5486276745796204, "grad_norm": 1.1782273054122925, "learning_rate": 6.751155623155853e-05} +{"ts": "2025-12-27T23:16:32", "event": "train_log", "step": 6192, "epoch": 2.6126582278481014, "progress_pct": 43.54, "epoch_pct": 43.54, "eta": "37:45:52", "max_grad_norm": 0.8, "loss": 0.630682110786438, "grad_norm": 1.5475431680679321, "learning_rate": 6.748906059867018e-05} +{"ts": "2025-12-27T23:16:45", "event": "train_log", "step": 6194, "epoch": 2.6135021097046414, "progress_pct": 43.56, "epoch_pct": 43.56, "eta": "37:44:50", "max_grad_norm": 0.8, "loss": 0.571597695350647, "grad_norm": 1.237891435623169, "learning_rate": 6.746656093139143e-05} +{"ts": "2025-12-27T23:16:58", "event": "train_log", "step": 6196, "epoch": 2.6143459915611813, "progress_pct": 43.57, "epoch_pct": 43.57, "eta": "37:43:49", "max_grad_norm": 0.8, "loss": 0.6020040512084961, "grad_norm": 1.2367130517959595, "learning_rate": 6.744405723491253e-05} +{"ts": "2025-12-27T23:17:11", "event": "train_log", "step": 6198, "epoch": 2.6151898734177212, "progress_pct": 43.59, "epoch_pct": 43.59, "eta": "37:42:49", "max_grad_norm": 0.8, "loss": 0.5520704984664917, "grad_norm": 1.0747612714767456, "learning_rate": 6.742154951442464e-05} +{"ts": "2025-12-27T23:17:23", "event": "train_log", "step": 6200, "epoch": 2.6160337552742616, "progress_pct": 43.6, "epoch_pct": 43.6, "eta": "37:41:46", "max_grad_norm": 0.8, "loss": 0.7312755584716797, "grad_norm": 1.3944035768508911, "learning_rate": 6.739903777511985e-05} +{"ts": "2025-12-27T23:25:56", "event": "train_log", "step": 6200, "epoch": 2.6160337552742616, "progress_pct": 43.6, "epoch_pct": 43.6, "eta": "37:52:50", "max_grad_norm": 0.8, "eval_loss": 0.6795271039009094, "eval_runtime": 513.2393, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105} +{"ts": "2025-12-27T23:26:08", "event": "train_log", "step": 6202, "epoch": 2.6168776371308016, "progress_pct": 43.61, "epoch_pct": 43.61, "eta": "37:51:48", "max_grad_norm": 0.8, "loss": 0.617123007774353, "grad_norm": 1.3716613054275513, "learning_rate": 6.737652202219121e-05} +{"ts": "2025-12-27T23:26:21", "event": "train_log", "step": 6204, "epoch": 2.6177215189873415, "progress_pct": 43.63, "epoch_pct": 43.63, "eta": "37:50:47", "max_grad_norm": 0.8, "loss": 0.5791950225830078, "grad_norm": 1.1962300539016724, "learning_rate": 6.735400226083267e-05} +{"ts": "2025-12-27T23:26:33", "event": "train_log", "step": 6206, "epoch": 2.618565400843882, "progress_pct": 43.64, "epoch_pct": 43.64, "eta": "37:49:44", "max_grad_norm": 0.8, "loss": 0.5941018462181091, "grad_norm": 1.2570394277572632, "learning_rate": 6.733147849623909e-05} +{"ts": "2025-12-27T23:26:46", "event": "train_log", "step": 6208, "epoch": 2.619409282700422, "progress_pct": 43.66, "epoch_pct": 43.66, "eta": "37:48:43", "max_grad_norm": 0.8, "loss": 0.5417253971099854, "grad_norm": 1.2903523445129395, "learning_rate": 6.730895073360628e-05} +{"ts": "2025-12-27T23:26:59", "event": "train_log", "step": 6210, "epoch": 2.620253164556962, "progress_pct": 43.67, "epoch_pct": 43.67, "eta": "37:47:42", "max_grad_norm": 0.8, "loss": 0.536359965801239, "grad_norm": 1.0618562698364258, "learning_rate": 6.728641897813096e-05} +{"ts": "2025-12-27T23:27:11", "event": "train_log", "step": 6212, "epoch": 2.621097046413502, "progress_pct": 43.68, "epoch_pct": 43.68, "eta": "37:46:40", "max_grad_norm": 0.8, "loss": 0.6409479975700378, "grad_norm": 1.307300090789795, "learning_rate": 6.726388323501077e-05} +{"ts": "2025-12-27T23:27:23", "event": "train_log", "step": 6214, "epoch": 2.621940928270042, "progress_pct": 43.7, "epoch_pct": 43.7, "eta": "37:45:38", "max_grad_norm": 0.8, "loss": 0.66277676820755, "grad_norm": 1.3672584295272827, "learning_rate": 6.72413435094443e-05} +{"ts": "2025-12-27T23:27:36", "event": "train_log", "step": 6216, "epoch": 2.622784810126582, "progress_pct": 43.71, "epoch_pct": 43.71, "eta": "37:44:37", "max_grad_norm": 0.8, "loss": 0.6193054914474487, "grad_norm": 1.2156232595443726, "learning_rate": 6.721879980663098e-05} +{"ts": "2025-12-27T23:27:49", "event": "train_log", "step": 6218, "epoch": 2.6236286919831224, "progress_pct": 43.73, "epoch_pct": 43.73, "eta": "37:43:35", "max_grad_norm": 0.8, "loss": 0.5773701667785645, "grad_norm": 1.1575636863708496, "learning_rate": 6.719625213177124e-05} +{"ts": "2025-12-27T23:28:01", "event": "train_log", "step": 6220, "epoch": 2.6244725738396624, "progress_pct": 43.74, "epoch_pct": 43.74, "eta": "37:42:33", "max_grad_norm": 0.8, "loss": 0.6913977265357971, "grad_norm": 1.2327474355697632, "learning_rate": 6.71737004900664e-05} +{"ts": "2025-12-27T23:28:13", "event": "train_log", "step": 6222, "epoch": 2.6253164556962023, "progress_pct": 43.76, "epoch_pct": 43.76, "eta": "37:41:31", "max_grad_norm": 0.8, "loss": 0.5773524045944214, "grad_norm": 1.1316778659820557, "learning_rate": 6.715114488671869e-05} +{"ts": "2025-12-27T23:28:26", "event": "train_log", "step": 6224, "epoch": 2.6261603375527427, "progress_pct": 43.77, "epoch_pct": 43.77, "eta": "37:40:30", "max_grad_norm": 0.8, "loss": 0.5554601550102234, "grad_norm": 1.1508816480636597, "learning_rate": 6.712858532693125e-05} +{"ts": "2025-12-27T23:28:38", "event": "train_log", "step": 6226, "epoch": 2.6270042194092826, "progress_pct": 43.78, "epoch_pct": 43.78, "eta": "37:39:28", "max_grad_norm": 0.8, "loss": 0.6090670824050903, "grad_norm": 1.2404967546463013, "learning_rate": 6.710602181590812e-05} +{"ts": "2025-12-27T23:28:50", "event": "train_log", "step": 6228, "epoch": 2.6278481012658226, "progress_pct": 43.8, "epoch_pct": 43.8, "eta": "37:38:26", "max_grad_norm": 0.8, "loss": 0.5546537637710571, "grad_norm": 1.0721718072891235, "learning_rate": 6.70834543588543e-05} +{"ts": "2025-12-27T23:29:02", "event": "train_log", "step": 6230, "epoch": 2.628691983122363, "progress_pct": 43.81, "epoch_pct": 43.81, "eta": "37:37:24", "max_grad_norm": 0.8, "loss": 0.5939876437187195, "grad_norm": 1.2788114547729492, "learning_rate": 6.706088296097564e-05} +{"ts": "2025-12-27T23:29:15", "event": "train_log", "step": 6232, "epoch": 2.629535864978903, "progress_pct": 43.83, "epoch_pct": 43.83, "eta": "37:36:23", "max_grad_norm": 0.8, "loss": 0.5291836857795715, "grad_norm": 1.1952526569366455, "learning_rate": 6.703830762747896e-05} +{"ts": "2025-12-27T23:29:28", "event": "train_log", "step": 6234, "epoch": 2.630379746835443, "progress_pct": 43.84, "epoch_pct": 43.84, "eta": "37:35:22", "max_grad_norm": 0.8, "loss": 0.518436074256897, "grad_norm": 1.0261807441711426, "learning_rate": 6.701572836357191e-05} +{"ts": "2025-12-27T23:29:41", "event": "train_log", "step": 6236, "epoch": 2.6312236286919832, "progress_pct": 43.85, "epoch_pct": 43.85, "eta": "37:34:22", "max_grad_norm": 0.8, "loss": 0.5830684900283813, "grad_norm": 1.1804791688919067, "learning_rate": 6.699314517446316e-05} +{"ts": "2025-12-27T23:29:53", "event": "train_log", "step": 6238, "epoch": 2.632067510548523, "progress_pct": 43.87, "epoch_pct": 43.87, "eta": "37:33:21", "max_grad_norm": 0.8, "loss": 0.5899971127510071, "grad_norm": 1.2079823017120361, "learning_rate": 6.697055806536214e-05} +{"ts": "2025-12-27T23:30:05", "event": "train_log", "step": 6240, "epoch": 2.632911392405063, "progress_pct": 43.88, "epoch_pct": 43.88, "eta": "37:32:19", "max_grad_norm": 0.8, "loss": 0.6533132791519165, "grad_norm": 1.1989154815673828, "learning_rate": 6.694796704147932e-05} +{"ts": "2025-12-27T23:30:18", "event": "train_log", "step": 6242, "epoch": 2.6337552742616035, "progress_pct": 43.9, "epoch_pct": 43.9, "eta": "37:31:18", "max_grad_norm": 0.8, "loss": 0.5341002345085144, "grad_norm": 1.0621024370193481, "learning_rate": 6.692537210802598e-05} +{"ts": "2025-12-27T23:30:30", "event": "train_log", "step": 6244, "epoch": 2.6345991561181434, "progress_pct": 43.91, "epoch_pct": 43.91, "eta": "37:30:16", "max_grad_norm": 0.8, "loss": 0.6795719861984253, "grad_norm": 1.2911880016326904, "learning_rate": 6.690277327021436e-05} +{"ts": "2025-12-27T23:30:42", "event": "train_log", "step": 6246, "epoch": 2.6354430379746834, "progress_pct": 43.92, "epoch_pct": 43.92, "eta": "37:29:14", "max_grad_norm": 0.8, "loss": 0.5390555262565613, "grad_norm": 1.3586145639419556, "learning_rate": 6.688017053325757e-05} +{"ts": "2025-12-27T23:30:54", "event": "train_log", "step": 6248, "epoch": 2.6362869198312238, "progress_pct": 43.94, "epoch_pct": 43.94, "eta": "37:28:13", "max_grad_norm": 0.8, "loss": 0.5935586094856262, "grad_norm": 1.31569242477417, "learning_rate": 6.685756390236964e-05} +{"ts": "2025-12-27T23:31:07", "event": "train_log", "step": 6250, "epoch": 2.6371308016877637, "progress_pct": 43.95, "epoch_pct": 43.95, "eta": "37:27:12", "max_grad_norm": 0.8, "loss": 0.5845919847488403, "grad_norm": 1.0801384449005127, "learning_rate": 6.683495338276547e-05} +{"ts": "2025-12-27T23:31:20", "event": "train_log", "step": 6252, "epoch": 2.6379746835443036, "progress_pct": 43.97, "epoch_pct": 43.97, "eta": "37:26:12", "max_grad_norm": 0.8, "loss": 0.6017906665802002, "grad_norm": 1.179715633392334, "learning_rate": 6.681233897966087e-05} +{"ts": "2025-12-27T23:31:32", "event": "train_log", "step": 6254, "epoch": 2.638818565400844, "progress_pct": 43.98, "epoch_pct": 43.98, "eta": "37:25:10", "max_grad_norm": 0.8, "loss": 0.6637946367263794, "grad_norm": 1.1927930116653442, "learning_rate": 6.678972069827255e-05} +{"ts": "2025-12-27T23:31:45", "event": "train_log", "step": 6256, "epoch": 2.639662447257384, "progress_pct": 43.99, "epoch_pct": 43.99, "eta": "37:24:10", "max_grad_norm": 0.8, "loss": 0.5572535991668701, "grad_norm": 1.2167247533798218, "learning_rate": 6.676709854381812e-05} +{"ts": "2025-12-27T23:31:58", "event": "train_log", "step": 6258, "epoch": 2.640506329113924, "progress_pct": 44.01, "epoch_pct": 44.01, "eta": "37:23:09", "max_grad_norm": 0.8, "loss": 0.5426514148712158, "grad_norm": 1.2026311159133911, "learning_rate": 6.674447252151608e-05} +{"ts": "2025-12-27T23:32:10", "event": "train_log", "step": 6260, "epoch": 2.6413502109704643, "progress_pct": 44.02, "epoch_pct": 44.02, "eta": "37:22:08", "max_grad_norm": 0.8, "loss": 0.5123113989830017, "grad_norm": 1.101891279220581, "learning_rate": 6.672184263658579e-05} +{"ts": "2025-12-27T23:32:22", "event": "train_log", "step": 6262, "epoch": 2.6421940928270042, "progress_pct": 44.04, "epoch_pct": 44.04, "eta": "37:21:06", "max_grad_norm": 0.8, "loss": 0.6018276214599609, "grad_norm": 1.3467986583709717, "learning_rate": 6.669920889424758e-05} +{"ts": "2025-12-27T23:32:35", "event": "train_log", "step": 6264, "epoch": 2.643037974683544, "progress_pct": 44.05, "epoch_pct": 44.05, "eta": "37:20:06", "max_grad_norm": 0.8, "loss": 0.5618380308151245, "grad_norm": 1.2477779388427734, "learning_rate": 6.667657129972257e-05} +{"ts": "2025-12-27T23:32:48", "event": "train_log", "step": 6266, "epoch": 2.6438818565400846, "progress_pct": 44.06, "epoch_pct": 44.06, "eta": "37:19:06", "max_grad_norm": 0.8, "loss": 0.5541924834251404, "grad_norm": 1.1284273862838745, "learning_rate": 6.665392985823287e-05} +{"ts": "2025-12-27T23:33:00", "event": "train_log", "step": 6268, "epoch": 2.6447257383966245, "progress_pct": 44.08, "epoch_pct": 44.08, "eta": "37:18:05", "max_grad_norm": 0.8, "loss": 0.5534335970878601, "grad_norm": 1.2376370429992676, "learning_rate": 6.663128457500137e-05} +{"ts": "2025-12-27T23:33:12", "event": "train_log", "step": 6270, "epoch": 2.6455696202531644, "progress_pct": 44.09, "epoch_pct": 44.09, "eta": "37:17:04", "max_grad_norm": 0.8, "loss": 0.6160520315170288, "grad_norm": 1.3205965757369995, "learning_rate": 6.660863545525196e-05} +{"ts": "2025-12-27T23:33:25", "event": "train_log", "step": 6272, "epoch": 2.646413502109705, "progress_pct": 44.11, "epoch_pct": 44.11, "eta": "37:16:03", "max_grad_norm": 0.8, "loss": 0.6035991311073303, "grad_norm": 1.175926685333252, "learning_rate": 6.65859825042093e-05} +{"ts": "2025-12-27T23:33:37", "event": "train_log", "step": 6274, "epoch": 2.6472573839662448, "progress_pct": 44.12, "epoch_pct": 44.12, "eta": "37:15:02", "max_grad_norm": 0.8, "loss": 0.6101992130279541, "grad_norm": 1.2805176973342896, "learning_rate": 6.656332572709901e-05} +{"ts": "2025-12-27T23:33:49", "event": "train_log", "step": 6276, "epoch": 2.6481012658227847, "progress_pct": 44.14, "epoch_pct": 44.14, "eta": "37:14:01", "max_grad_norm": 0.8, "loss": 0.5665684342384338, "grad_norm": 1.2493922710418701, "learning_rate": 6.65406651291476e-05} +{"ts": "2025-12-27T23:34:01", "event": "train_log", "step": 6278, "epoch": 2.648945147679325, "progress_pct": 44.15, "epoch_pct": 44.15, "eta": "37:12:59", "max_grad_norm": 0.8, "loss": 0.682868242263794, "grad_norm": 1.3103299140930176, "learning_rate": 6.65180007155824e-05} +{"ts": "2025-12-27T23:34:13", "event": "train_log", "step": 6280, "epoch": 2.649789029535865, "progress_pct": 44.16, "epoch_pct": 44.16, "eta": "37:11:58", "max_grad_norm": 0.8, "loss": 0.6398087739944458, "grad_norm": 1.3098952770233154, "learning_rate": 6.649533249163167e-05} +{"ts": "2025-12-27T23:34:25", "event": "train_log", "step": 6282, "epoch": 2.650632911392405, "progress_pct": 44.18, "epoch_pct": 44.18, "eta": "37:10:56", "max_grad_norm": 0.8, "loss": 0.5410205721855164, "grad_norm": 1.230396032333374, "learning_rate": 6.647266046252454e-05} +{"ts": "2025-12-27T23:34:37", "event": "train_log", "step": 6284, "epoch": 2.6514767932489454, "progress_pct": 44.19, "epoch_pct": 44.19, "eta": "37:09:56", "max_grad_norm": 0.8, "loss": 0.6019781231880188, "grad_norm": 1.1755880117416382, "learning_rate": 6.6449984633491e-05} +{"ts": "2025-12-27T23:34:49", "event": "train_log", "step": 6286, "epoch": 2.6523206751054853, "progress_pct": 44.21, "epoch_pct": 44.21, "eta": "37:08:55", "max_grad_norm": 0.8, "loss": 0.5327204465866089, "grad_norm": 1.1013081073760986, "learning_rate": 6.642730500976193e-05} +{"ts": "2025-12-27T23:35:01", "event": "train_log", "step": 6288, "epoch": 2.6531645569620252, "progress_pct": 44.22, "epoch_pct": 44.22, "eta": "37:07:54", "max_grad_norm": 0.8, "loss": 0.6458070278167725, "grad_norm": 1.1285136938095093, "learning_rate": 6.640462159656908e-05} +{"ts": "2025-12-27T23:35:13", "event": "train_log", "step": 6290, "epoch": 2.6540084388185656, "progress_pct": 44.23, "epoch_pct": 44.23, "eta": "37:06:53", "max_grad_norm": 0.8, "loss": 0.6038496494293213, "grad_norm": 1.5320124626159668, "learning_rate": 6.638193439914512e-05} +{"ts": "2025-12-27T23:35:26", "event": "train_log", "step": 6292, "epoch": 2.6548523206751056, "progress_pct": 44.25, "epoch_pct": 44.25, "eta": "37:05:53", "max_grad_norm": 0.8, "loss": 0.5353283286094666, "grad_norm": 1.0231032371520996, "learning_rate": 6.635924342272349e-05} +{"ts": "2025-12-27T23:35:39", "event": "train_log", "step": 6294, "epoch": 2.6556962025316455, "progress_pct": 44.26, "epoch_pct": 44.26, "eta": "37:04:52", "max_grad_norm": 0.8, "loss": 0.644368588924408, "grad_norm": 1.1871505975723267, "learning_rate": 6.633654867253858e-05} +{"ts": "2025-12-27T23:35:51", "event": "train_log", "step": 6296, "epoch": 2.656540084388186, "progress_pct": 44.28, "epoch_pct": 44.28, "eta": "37:03:53", "max_grad_norm": 0.8, "loss": 0.5251830220222473, "grad_norm": 1.0641425848007202, "learning_rate": 6.631385015382565e-05} +{"ts": "2025-12-27T23:36:05", "event": "train_log", "step": 6298, "epoch": 2.657383966244726, "progress_pct": 44.29, "epoch_pct": 44.29, "eta": "37:02:54", "max_grad_norm": 0.8, "loss": 0.527733564376831, "grad_norm": 0.8980898261070251, "learning_rate": 6.62911478718208e-05} +{"ts": "2025-12-27T23:36:18", "event": "train_log", "step": 6300, "epoch": 2.6582278481012658, "progress_pct": 44.3, "epoch_pct": 44.3, "eta": "37:01:53", "max_grad_norm": 0.8, "loss": 0.5868222117424011, "grad_norm": 1.1694822311401367, "learning_rate": 6.626844183176102e-05} +{"ts": "2025-12-27T23:44:50", "event": "train_log", "step": 6300, "epoch": 2.6582278481012658, "progress_pct": 44.3, "epoch_pct": 44.3, "eta": "37:12:38", "max_grad_norm": 0.8, "eval_loss": 0.6781066656112671, "eval_runtime": 512.3669, "eval_samples_per_second": 4.112, "eval_steps_per_second": 4.112} +{"ts": "2025-12-27T23:45:02", "event": "train_log", "step": 6302, "epoch": 2.659071729957806, "progress_pct": 44.32, "epoch_pct": 44.32, "eta": "37:11:37", "max_grad_norm": 0.8, "loss": 0.5965607166290283, "grad_norm": 1.3010352849960327, "learning_rate": 6.624573203888413e-05} +{"ts": "2025-12-27T23:45:15", "event": "train_log", "step": 6304, "epoch": 2.659915611814346, "progress_pct": 44.33, "epoch_pct": 44.33, "eta": "37:10:36", "max_grad_norm": 0.8, "loss": 0.5776658654212952, "grad_norm": 1.074964165687561, "learning_rate": 6.62230184984289e-05} +{"ts": "2025-12-27T23:45:28", "event": "train_log", "step": 6306, "epoch": 2.660759493670886, "progress_pct": 44.35, "epoch_pct": 44.35, "eta": "37:09:36", "max_grad_norm": 0.8, "loss": 0.584223210811615, "grad_norm": 1.0930451154708862, "learning_rate": 6.620030121563484e-05} +{"ts": "2025-12-27T23:45:41", "event": "train_log", "step": 6308, "epoch": 2.6616033755274264, "progress_pct": 44.36, "epoch_pct": 44.36, "eta": "37:08:36", "max_grad_norm": 0.8, "loss": 0.534063994884491, "grad_norm": 1.1418803930282593, "learning_rate": 6.617758019574243e-05} +{"ts": "2025-12-27T23:45:54", "event": "train_log", "step": 6310, "epoch": 2.6624472573839664, "progress_pct": 44.37, "epoch_pct": 44.37, "eta": "37:07:36", "max_grad_norm": 0.8, "loss": 0.5719610452651978, "grad_norm": 1.1602790355682373, "learning_rate": 6.615485544399298e-05} +{"ts": "2025-12-27T23:46:06", "event": "train_log", "step": 6312, "epoch": 2.6632911392405063, "progress_pct": 44.39, "epoch_pct": 44.39, "eta": "37:06:36", "max_grad_norm": 0.8, "loss": 0.5489934682846069, "grad_norm": 1.0926544666290283, "learning_rate": 6.613212696562863e-05} +{"ts": "2025-12-27T23:46:19", "event": "train_log", "step": 6314, "epoch": 2.6641350210970463, "progress_pct": 44.4, "epoch_pct": 44.4, "eta": "37:05:35", "max_grad_norm": 0.8, "loss": 0.5568612217903137, "grad_norm": 1.2560242414474487, "learning_rate": 6.610939476589239e-05} +{"ts": "2025-12-27T23:46:32", "event": "train_log", "step": 6316, "epoch": 2.6649789029535866, "progress_pct": 44.42, "epoch_pct": 44.42, "eta": "37:04:36", "max_grad_norm": 0.8, "loss": 0.6019266247749329, "grad_norm": 1.110960602760315, "learning_rate": 6.60866588500282e-05} +{"ts": "2025-12-27T23:46:44", "event": "train_log", "step": 6318, "epoch": 2.6658227848101266, "progress_pct": 44.43, "epoch_pct": 44.43, "eta": "37:03:35", "max_grad_norm": 0.8, "loss": 0.6083081364631653, "grad_norm": 1.333012342453003, "learning_rate": 6.606391922328074e-05} +{"ts": "2025-12-27T23:46:57", "event": "train_log", "step": 6320, "epoch": 2.6666666666666665, "progress_pct": 44.44, "epoch_pct": 44.44, "eta": "37:02:35", "max_grad_norm": 0.8, "loss": 0.5586183071136475, "grad_norm": 1.1256170272827148, "learning_rate": 6.604117589089564e-05} +{"ts": "2025-12-27T23:47:08", "event": "train_log", "step": 6322, "epoch": 2.667510548523207, "progress_pct": 44.46, "epoch_pct": 44.46, "eta": "37:01:34", "max_grad_norm": 0.8, "loss": 0.5676470994949341, "grad_norm": 1.2877609729766846, "learning_rate": 6.601842885811934e-05} +{"ts": "2025-12-27T23:47:21", "event": "train_log", "step": 6324, "epoch": 2.668354430379747, "progress_pct": 44.47, "epoch_pct": 44.47, "eta": "37:00:33", "max_grad_norm": 0.8, "loss": 0.6470263600349426, "grad_norm": 1.305034875869751, "learning_rate": 6.599567813019914e-05} +{"ts": "2025-12-27T23:47:34", "event": "train_log", "step": 6326, "epoch": 2.669198312236287, "progress_pct": 44.49, "epoch_pct": 44.49, "eta": "36:59:33", "max_grad_norm": 0.8, "loss": 0.588540256023407, "grad_norm": 1.1695195436477661, "learning_rate": 6.597292371238318e-05} +{"ts": "2025-12-27T23:47:46", "event": "train_log", "step": 6328, "epoch": 2.670042194092827, "progress_pct": 44.5, "epoch_pct": 44.5, "eta": "36:58:33", "max_grad_norm": 0.8, "loss": 0.602922260761261, "grad_norm": 1.084652304649353, "learning_rate": 6.59501656099205e-05} +{"ts": "2025-12-27T23:47:59", "event": "train_log", "step": 6330, "epoch": 2.670886075949367, "progress_pct": 44.51, "epoch_pct": 44.51, "eta": "36:57:33", "max_grad_norm": 0.8, "loss": 0.5613425970077515, "grad_norm": 1.1664962768554688, "learning_rate": 6.592740382806094e-05} +{"ts": "2025-12-27T23:48:11", "event": "train_log", "step": 6332, "epoch": 2.671729957805907, "progress_pct": 44.53, "epoch_pct": 44.53, "eta": "36:56:32", "max_grad_norm": 0.8, "loss": 0.5850927829742432, "grad_norm": 1.2208726406097412, "learning_rate": 6.590463837205522e-05} +{"ts": "2025-12-27T23:48:24", "event": "train_log", "step": 6334, "epoch": 2.672573839662447, "progress_pct": 44.54, "epoch_pct": 44.54, "eta": "36:55:33", "max_grad_norm": 0.8, "loss": 0.503675639629364, "grad_norm": 1.0662479400634766, "learning_rate": 6.588186924715488e-05} +{"ts": "2025-12-27T23:48:36", "event": "train_log", "step": 6336, "epoch": 2.6734177215189874, "progress_pct": 44.56, "epoch_pct": 44.56, "eta": "36:54:32", "max_grad_norm": 0.8, "loss": 0.6245100498199463, "grad_norm": 1.5318000316619873, "learning_rate": 6.58590964586123e-05} +{"ts": "2025-12-27T23:48:48", "event": "train_log", "step": 6338, "epoch": 2.6742616033755273, "progress_pct": 44.57, "epoch_pct": 44.57, "eta": "36:53:32", "max_grad_norm": 0.8, "loss": 0.6556243896484375, "grad_norm": 1.402784824371338, "learning_rate": 6.583632001168077e-05} +{"ts": "2025-12-27T23:49:00", "event": "train_log", "step": 6340, "epoch": 2.6751054852320673, "progress_pct": 44.59, "epoch_pct": 44.59, "eta": "36:52:31", "max_grad_norm": 0.8, "loss": 0.6398119926452637, "grad_norm": 1.2293213605880737, "learning_rate": 6.581353991161435e-05} +{"ts": "2025-12-27T23:49:12", "event": "train_log", "step": 6342, "epoch": 2.6759493670886076, "progress_pct": 44.6, "epoch_pct": 44.6, "eta": "36:51:31", "max_grad_norm": 0.8, "loss": 0.5792493224143982, "grad_norm": 1.2687599658966064, "learning_rate": 6.579075616366797e-05} +{"ts": "2025-12-27T23:49:25", "event": "train_log", "step": 6344, "epoch": 2.6767932489451476, "progress_pct": 44.61, "epoch_pct": 44.61, "eta": "36:50:30", "max_grad_norm": 0.8, "loss": 0.6669304966926575, "grad_norm": 1.2112480401992798, "learning_rate": 6.576796877309741e-05} +{"ts": "2025-12-27T23:49:37", "event": "train_log", "step": 6346, "epoch": 2.6776371308016875, "progress_pct": 44.63, "epoch_pct": 44.63, "eta": "36:49:30", "max_grad_norm": 0.8, "loss": 0.6012452840805054, "grad_norm": 1.3074487447738647, "learning_rate": 6.574517774515929e-05} +{"ts": "2025-12-27T23:49:49", "event": "train_log", "step": 6348, "epoch": 2.678481012658228, "progress_pct": 44.64, "epoch_pct": 44.64, "eta": "36:48:30", "max_grad_norm": 0.8, "loss": 0.6556297540664673, "grad_norm": 1.3157081604003906, "learning_rate": 6.572238308511106e-05} +{"ts": "2025-12-27T23:50:03", "event": "train_log", "step": 6350, "epoch": 2.679324894514768, "progress_pct": 44.66, "epoch_pct": 44.66, "eta": "36:47:31", "max_grad_norm": 0.8, "loss": 0.5607976317405701, "grad_norm": 1.0735292434692383, "learning_rate": 6.569958479821099e-05} +{"ts": "2025-12-27T23:50:15", "event": "train_log", "step": 6352, "epoch": 2.680168776371308, "progress_pct": 44.67, "epoch_pct": 44.67, "eta": "36:46:32", "max_grad_norm": 0.8, "loss": 0.6040812730789185, "grad_norm": 1.1896809339523315, "learning_rate": 6.567678288971825e-05} +{"ts": "2025-12-27T23:50:28", "event": "train_log", "step": 6354, "epoch": 2.681012658227848, "progress_pct": 44.68, "epoch_pct": 44.68, "eta": "36:45:31", "max_grad_norm": 0.8, "loss": 0.5807676911354065, "grad_norm": 1.1350760459899902, "learning_rate": 6.565397736489274e-05} +{"ts": "2025-12-27T23:50:40", "event": "train_log", "step": 6356, "epoch": 2.681856540084388, "progress_pct": 44.7, "epoch_pct": 44.7, "eta": "36:44:31", "max_grad_norm": 0.8, "loss": 0.5877989530563354, "grad_norm": 1.3865782022476196, "learning_rate": 6.563116822899532e-05} +{"ts": "2025-12-27T23:50:52", "event": "train_log", "step": 6358, "epoch": 2.682700421940928, "progress_pct": 44.71, "epoch_pct": 44.71, "eta": "36:43:31", "max_grad_norm": 0.8, "loss": 0.614531397819519, "grad_norm": 1.218682050704956, "learning_rate": 6.560835548728758e-05} +{"ts": "2025-12-27T23:51:06", "event": "train_log", "step": 6360, "epoch": 2.6835443037974684, "progress_pct": 44.73, "epoch_pct": 44.73, "eta": "36:42:33", "max_grad_norm": 0.8, "loss": 0.5880973935127258, "grad_norm": 1.06162691116333, "learning_rate": 6.5585539145032e-05} +{"ts": "2025-12-27T23:51:19", "event": "train_log", "step": 6362, "epoch": 2.6843881856540084, "progress_pct": 44.74, "epoch_pct": 44.74, "eta": "36:41:34", "max_grad_norm": 0.8, "loss": 0.5795428156852722, "grad_norm": 1.264328956604004, "learning_rate": 6.556271920749187e-05} +{"ts": "2025-12-27T23:51:31", "event": "train_log", "step": 6364, "epoch": 2.6852320675105483, "progress_pct": 44.75, "epoch_pct": 44.75, "eta": "36:40:34", "max_grad_norm": 0.8, "loss": 0.5927176475524902, "grad_norm": 1.335652470588684, "learning_rate": 6.553989567993129e-05} +{"ts": "2025-12-27T23:51:44", "event": "train_log", "step": 6366, "epoch": 2.6860759493670887, "progress_pct": 44.77, "epoch_pct": 44.77, "eta": "36:39:35", "max_grad_norm": 0.8, "loss": 0.5814473628997803, "grad_norm": 1.1110745668411255, "learning_rate": 6.551706856761524e-05} +{"ts": "2025-12-27T23:51:56", "event": "train_log", "step": 6368, "epoch": 2.6869198312236287, "progress_pct": 44.78, "epoch_pct": 44.78, "eta": "36:38:35", "max_grad_norm": 0.8, "loss": 0.557738184928894, "grad_norm": 1.1731220483779907, "learning_rate": 6.549423787580947e-05} +{"ts": "2025-12-27T23:52:09", "event": "train_log", "step": 6370, "epoch": 2.6877637130801686, "progress_pct": 44.8, "epoch_pct": 44.8, "eta": "36:37:36", "max_grad_norm": 0.8, "loss": 0.5947291254997253, "grad_norm": 1.2679874897003174, "learning_rate": 6.54714036097806e-05} +{"ts": "2025-12-27T23:52:21", "event": "train_log", "step": 6372, "epoch": 2.688607594936709, "progress_pct": 44.81, "epoch_pct": 44.81, "eta": "36:36:36", "max_grad_norm": 0.8, "loss": 0.5769563317298889, "grad_norm": 1.112322211265564, "learning_rate": 6.544856577479606e-05} +{"ts": "2025-12-27T23:52:33", "event": "train_log", "step": 6374, "epoch": 2.689451476793249, "progress_pct": 44.82, "epoch_pct": 44.82, "eta": "36:35:36", "max_grad_norm": 0.8, "loss": 0.6077675223350525, "grad_norm": 1.3385759592056274, "learning_rate": 6.542572437612408e-05} +{"ts": "2025-12-27T23:52:47", "event": "train_log", "step": 6376, "epoch": 2.690295358649789, "progress_pct": 44.84, "epoch_pct": 44.84, "eta": "36:34:38", "max_grad_norm": 0.8, "loss": 0.5600538849830627, "grad_norm": 1.0953450202941895, "learning_rate": 6.540287941903375e-05} +{"ts": "2025-12-27T23:53:00", "event": "train_log", "step": 6378, "epoch": 2.6911392405063292, "progress_pct": 44.85, "epoch_pct": 44.85, "eta": "36:33:38", "max_grad_norm": 0.8, "loss": 0.5828459858894348, "grad_norm": 1.2455042600631714, "learning_rate": 6.538003090879495e-05} +{"ts": "2025-12-27T23:53:12", "event": "train_log", "step": 6380, "epoch": 2.691983122362869, "progress_pct": 44.87, "epoch_pct": 44.87, "eta": "36:32:39", "max_grad_norm": 0.8, "loss": 0.5844002366065979, "grad_norm": 1.2563562393188477, "learning_rate": 6.53571788506784e-05} +{"ts": "2025-12-27T23:53:24", "event": "train_log", "step": 6382, "epoch": 2.692827004219409, "progress_pct": 44.88, "epoch_pct": 44.88, "eta": "36:31:38", "max_grad_norm": 0.8, "loss": 0.6632003784179688, "grad_norm": 1.3466061353683472, "learning_rate": 6.533432324995563e-05} +{"ts": "2025-12-27T23:53:37", "event": "train_log", "step": 6384, "epoch": 2.6936708860759495, "progress_pct": 44.89, "epoch_pct": 44.89, "eta": "36:30:40", "max_grad_norm": 0.8, "loss": 0.5532103180885315, "grad_norm": 1.2467784881591797, "learning_rate": 6.531146411189899e-05} +{"ts": "2025-12-27T23:53:49", "event": "train_log", "step": 6386, "epoch": 2.6945147679324895, "progress_pct": 44.91, "epoch_pct": 44.91, "eta": "36:29:40", "max_grad_norm": 0.8, "loss": 0.5722881555557251, "grad_norm": 1.344250202178955, "learning_rate": 6.528860144178163e-05} +{"ts": "2025-12-27T23:54:01", "event": "train_log", "step": 6388, "epoch": 2.6953586497890294, "progress_pct": 44.92, "epoch_pct": 44.92, "eta": "36:28:40", "max_grad_norm": 0.8, "loss": 0.6424282789230347, "grad_norm": 1.3688865900039673, "learning_rate": 6.526573524487756e-05} +{"ts": "2025-12-27T23:54:14", "event": "train_log", "step": 6390, "epoch": 2.6962025316455698, "progress_pct": 44.94, "epoch_pct": 44.94, "eta": "36:27:41", "max_grad_norm": 0.8, "loss": 0.5986620783805847, "grad_norm": 1.4252339601516724, "learning_rate": 6.524286552646153e-05} +{"ts": "2025-12-27T23:54:26", "event": "train_log", "step": 6392, "epoch": 2.6970464135021097, "progress_pct": 44.95, "epoch_pct": 44.95, "eta": "36:26:41", "max_grad_norm": 0.8, "loss": 0.6466318368911743, "grad_norm": 1.4102380275726318, "learning_rate": 6.52199922918092e-05} +{"ts": "2025-12-27T23:54:38", "event": "train_log", "step": 6394, "epoch": 2.6978902953586497, "progress_pct": 44.96, "epoch_pct": 44.96, "eta": "36:25:42", "max_grad_norm": 0.8, "loss": 0.6259894371032715, "grad_norm": 1.184442400932312, "learning_rate": 6.519711554619692e-05} +{"ts": "2025-12-27T23:54:51", "event": "train_log", "step": 6396, "epoch": 2.69873417721519, "progress_pct": 44.98, "epoch_pct": 44.98, "eta": "36:24:42", "max_grad_norm": 0.8, "loss": 0.5682622194290161, "grad_norm": 1.2751896381378174, "learning_rate": 6.517423529490198e-05} +{"ts": "2025-12-27T23:55:03", "event": "train_log", "step": 6398, "epoch": 2.69957805907173, "progress_pct": 44.99, "epoch_pct": 44.99, "eta": "36:23:43", "max_grad_norm": 0.8, "loss": 0.573390007019043, "grad_norm": 1.3333114385604858, "learning_rate": 6.515135154320236e-05} +{"ts": "2025-12-27T23:55:15", "event": "train_log", "step": 6400, "epoch": 2.70042194092827, "progress_pct": 45.01, "epoch_pct": 45.01, "eta": "36:22:44", "max_grad_norm": 0.8, "loss": 0.5839408040046692, "grad_norm": 1.2505477666854858, "learning_rate": 6.512846429637693e-05} +{"ts": "2025-12-28T00:03:48", "event": "train_log", "step": 6400, "epoch": 2.70042194092827, "progress_pct": 45.01, "epoch_pct": 45.01, "eta": "36:33:10", "max_grad_norm": 0.8, "eval_loss": 0.6764505505561829, "eval_runtime": 512.7682, "eval_samples_per_second": 4.109, "eval_steps_per_second": 4.109} +{"ts": "2025-12-28T00:04:00", "event": "train_log", "step": 6402, "epoch": 2.7012658227848103, "progress_pct": 45.02, "epoch_pct": 45.02, "eta": "36:32:10", "max_grad_norm": 0.8, "loss": 0.6000106334686279, "grad_norm": 1.2822065353393555, "learning_rate": 6.510557355970534e-05} +{"ts": "2025-12-28T00:04:13", "event": "train_log", "step": 6404, "epoch": 2.7021097046413503, "progress_pct": 45.04, "epoch_pct": 45.04, "eta": "36:31:11", "max_grad_norm": 0.8, "loss": 0.5796633362770081, "grad_norm": 1.2144463062286377, "learning_rate": 6.508267933846803e-05} +{"ts": "2025-12-28T00:04:25", "event": "train_log", "step": 6406, "epoch": 2.70295358649789, "progress_pct": 45.05, "epoch_pct": 45.05, "eta": "36:30:11", "max_grad_norm": 0.8, "loss": 0.5976626873016357, "grad_norm": 1.189985990524292, "learning_rate": 6.505978163794628e-05} +{"ts": "2025-12-28T00:04:38", "event": "train_log", "step": 6408, "epoch": 2.7037974683544306, "progress_pct": 45.06, "epoch_pct": 45.06, "eta": "36:29:13", "max_grad_norm": 0.8, "loss": 0.5054599642753601, "grad_norm": 1.0484727621078491, "learning_rate": 6.503688046342212e-05} +{"ts": "2025-12-28T00:04:51", "event": "train_log", "step": 6410, "epoch": 2.7046413502109705, "progress_pct": 45.08, "epoch_pct": 45.08, "eta": "36:28:13", "max_grad_norm": 0.8, "loss": 0.6539149284362793, "grad_norm": 1.4333025217056274, "learning_rate": 6.501397582017844e-05} +{"ts": "2025-12-28T00:05:03", "event": "train_log", "step": 6412, "epoch": 2.7054852320675105, "progress_pct": 45.09, "epoch_pct": 45.09, "eta": "36:27:14", "max_grad_norm": 0.8, "loss": 0.5220640301704407, "grad_norm": 1.1808522939682007, "learning_rate": 6.499106771349887e-05} +{"ts": "2025-12-28T00:05:16", "event": "train_log", "step": 6414, "epoch": 2.706329113924051, "progress_pct": 45.11, "epoch_pct": 45.11, "eta": "36:26:15", "max_grad_norm": 0.8, "loss": 0.6019118428230286, "grad_norm": 2.8626298904418945, "learning_rate": 6.496815614866791e-05} +{"ts": "2025-12-28T00:05:29", "event": "train_log", "step": 6416, "epoch": 2.707172995780591, "progress_pct": 45.12, "epoch_pct": 45.12, "eta": "36:25:16", "max_grad_norm": 0.8, "loss": 0.5754269361495972, "grad_norm": 1.1092768907546997, "learning_rate": 6.494524113097078e-05} +{"ts": "2025-12-28T00:05:42", "event": "train_log", "step": 6418, "epoch": 2.7080168776371307, "progress_pct": 45.13, "epoch_pct": 45.13, "eta": "36:24:17", "max_grad_norm": 0.8, "loss": 0.5548025369644165, "grad_norm": 1.2416579723358154, "learning_rate": 6.492232266569353e-05} +{"ts": "2025-12-28T00:05:54", "event": "train_log", "step": 6420, "epoch": 2.708860759493671, "progress_pct": 45.15, "epoch_pct": 45.15, "eta": "36:23:18", "max_grad_norm": 0.8, "loss": 0.5706405639648438, "grad_norm": 1.012360692024231, "learning_rate": 6.489940075812306e-05} +{"ts": "2025-12-28T00:06:07", "event": "train_log", "step": 6422, "epoch": 2.709704641350211, "progress_pct": 45.16, "epoch_pct": 45.16, "eta": "36:22:19", "max_grad_norm": 0.8, "loss": 0.5862169861793518, "grad_norm": 1.376641869544983, "learning_rate": 6.487647541354698e-05} +{"ts": "2025-12-28T00:06:19", "event": "train_log", "step": 6424, "epoch": 2.710548523206751, "progress_pct": 45.18, "epoch_pct": 45.18, "eta": "36:21:20", "max_grad_norm": 0.8, "loss": 0.5928428769111633, "grad_norm": 1.2425684928894043, "learning_rate": 6.485354663725374e-05} +{"ts": "2025-12-28T00:06:32", "event": "train_log", "step": 6426, "epoch": 2.7113924050632914, "progress_pct": 45.19, "epoch_pct": 45.19, "eta": "36:20:21", "max_grad_norm": 0.8, "loss": 0.5903078317642212, "grad_norm": 1.0926302671432495, "learning_rate": 6.483061443453254e-05} +{"ts": "2025-12-28T00:06:45", "event": "train_log", "step": 6428, "epoch": 2.7122362869198313, "progress_pct": 45.2, "epoch_pct": 45.2, "eta": "36:19:22", "max_grad_norm": 0.8, "loss": 0.5848883986473083, "grad_norm": 1.3698115348815918, "learning_rate": 6.480767881067342e-05} +{"ts": "2025-12-28T00:06:58", "event": "train_log", "step": 6430, "epoch": 2.7130801687763713, "progress_pct": 45.22, "epoch_pct": 45.22, "eta": "36:18:23", "max_grad_norm": 0.8, "loss": 0.5285207629203796, "grad_norm": 1.2949504852294922, "learning_rate": 6.478473977096718e-05} +{"ts": "2025-12-28T00:07:10", "event": "train_log", "step": 6432, "epoch": 2.7139240506329116, "progress_pct": 45.23, "epoch_pct": 45.23, "eta": "36:17:25", "max_grad_norm": 0.8, "loss": 0.5965171456336975, "grad_norm": 1.3662208318710327, "learning_rate": 6.476179732070543e-05} +{"ts": "2025-12-28T00:07:22", "event": "train_log", "step": 6434, "epoch": 2.7147679324894516, "progress_pct": 45.25, "epoch_pct": 45.25, "eta": "36:16:25", "max_grad_norm": 0.8, "loss": 0.6549378037452698, "grad_norm": 1.3127343654632568, "learning_rate": 6.473885146518055e-05} +{"ts": "2025-12-28T00:07:35", "event": "train_log", "step": 6436, "epoch": 2.7156118143459915, "progress_pct": 45.26, "epoch_pct": 45.26, "eta": "36:15:26", "max_grad_norm": 0.8, "loss": 0.574461042881012, "grad_norm": 1.199431300163269, "learning_rate": 6.471590220968568e-05} +{"ts": "2025-12-28T00:07:47", "event": "train_log", "step": 6438, "epoch": 2.716455696202532, "progress_pct": 45.27, "epoch_pct": 45.27, "eta": "36:14:27", "max_grad_norm": 0.8, "loss": 0.6142178177833557, "grad_norm": 1.1624091863632202, "learning_rate": 6.469294955951481e-05} +{"ts": "2025-12-28T00:07:59", "event": "train_log", "step": 6440, "epoch": 2.717299578059072, "progress_pct": 45.29, "epoch_pct": 45.29, "eta": "36:13:27", "max_grad_norm": 0.8, "loss": 0.5775829553604126, "grad_norm": 1.2685147523880005, "learning_rate": 6.466999351996266e-05} +{"ts": "2025-12-28T00:08:12", "event": "train_log", "step": 6442, "epoch": 2.718143459915612, "progress_pct": 45.3, "epoch_pct": 45.3, "eta": "36:12:28", "max_grad_norm": 0.8, "loss": 0.5400159955024719, "grad_norm": 1.0987834930419922, "learning_rate": 6.464703409632476e-05} +{"ts": "2025-12-28T00:08:23", "event": "train_log", "step": 6444, "epoch": 2.7189873417721517, "progress_pct": 45.32, "epoch_pct": 45.32, "eta": "36:11:29", "max_grad_norm": 0.8, "loss": 0.558712899684906, "grad_norm": 1.2638986110687256, "learning_rate": 6.462407129389736e-05} +{"ts": "2025-12-28T00:08:36", "event": "train_log", "step": 6446, "epoch": 2.719831223628692, "progress_pct": 45.33, "epoch_pct": 45.33, "eta": "36:10:30", "max_grad_norm": 0.8, "loss": 0.5465238094329834, "grad_norm": 1.174168586730957, "learning_rate": 6.46011051179776e-05} +{"ts": "2025-12-28T00:08:48", "event": "train_log", "step": 6448, "epoch": 2.720675105485232, "progress_pct": 45.34, "epoch_pct": 45.34, "eta": "36:09:31", "max_grad_norm": 0.8, "loss": 0.629173219203949, "grad_norm": 1.2185649871826172, "learning_rate": 6.457813557386331e-05} +{"ts": "2025-12-28T00:09:01", "event": "train_log", "step": 6450, "epoch": 2.721518987341772, "progress_pct": 45.36, "epoch_pct": 45.36, "eta": "36:08:32", "max_grad_norm": 0.8, "loss": 0.5557543039321899, "grad_norm": 1.1563167572021484, "learning_rate": 6.455516266685311e-05} +{"ts": "2025-12-28T00:09:13", "event": "train_log", "step": 6452, "epoch": 2.7223628691983124, "progress_pct": 45.37, "epoch_pct": 45.37, "eta": "36:07:33", "max_grad_norm": 0.8, "loss": 0.6350696682929993, "grad_norm": 1.2934051752090454, "learning_rate": 6.453218640224642e-05} +{"ts": "2025-12-28T00:09:27", "event": "train_log", "step": 6454, "epoch": 2.7232067510548523, "progress_pct": 45.39, "epoch_pct": 45.39, "eta": "36:06:35", "max_grad_norm": 0.8, "loss": 0.544219434261322, "grad_norm": 1.045218825340271, "learning_rate": 6.450920678534342e-05} +{"ts": "2025-12-28T00:09:38", "event": "train_log", "step": 6456, "epoch": 2.7240506329113923, "progress_pct": 45.4, "epoch_pct": 45.4, "eta": "36:05:36", "max_grad_norm": 0.8, "loss": 0.6312481760978699, "grad_norm": 1.3102771043777466, "learning_rate": 6.44862238214451e-05} +{"ts": "2025-12-28T00:09:51", "event": "train_log", "step": 6458, "epoch": 2.7248945147679327, "progress_pct": 45.41, "epoch_pct": 45.41, "eta": "36:04:37", "max_grad_norm": 0.8, "loss": 0.5772860050201416, "grad_norm": 1.3338704109191895, "learning_rate": 6.446323751585312e-05} +{"ts": "2025-12-28T00:10:03", "event": "train_log", "step": 6460, "epoch": 2.7257383966244726, "progress_pct": 45.43, "epoch_pct": 45.43, "eta": "36:03:38", "max_grad_norm": 0.8, "loss": 0.5450227856636047, "grad_norm": 1.1826046705245972, "learning_rate": 6.444024787387003e-05} +{"ts": "2025-12-28T00:10:17", "event": "train_log", "step": 6462, "epoch": 2.7265822784810125, "progress_pct": 45.44, "epoch_pct": 45.44, "eta": "36:02:41", "max_grad_norm": 0.8, "loss": 0.5775642395019531, "grad_norm": 1.2449530363082886, "learning_rate": 6.441725490079908e-05} +{"ts": "2025-12-28T00:10:30", "event": "train_log", "step": 6464, "epoch": 2.7274261603375525, "progress_pct": 45.46, "epoch_pct": 45.46, "eta": "36:01:43", "max_grad_norm": 0.8, "loss": 0.5795316100120544, "grad_norm": 1.1204898357391357, "learning_rate": 6.439425860194432e-05} +{"ts": "2025-12-28T00:10:42", "event": "train_log", "step": 6466, "epoch": 2.728270042194093, "progress_pct": 45.47, "epoch_pct": 45.47, "eta": "36:00:44", "max_grad_norm": 0.8, "loss": 0.6187583804130554, "grad_norm": 1.179542064666748, "learning_rate": 6.437125898261056e-05} +{"ts": "2025-12-28T00:10:54", "event": "train_log", "step": 6468, "epoch": 2.729113924050633, "progress_pct": 45.49, "epoch_pct": 45.49, "eta": "35:59:45", "max_grad_norm": 0.8, "loss": 0.581790566444397, "grad_norm": 1.2231724262237549, "learning_rate": 6.434825604810333e-05} +{"ts": "2025-12-28T00:11:08", "event": "train_log", "step": 6470, "epoch": 2.7299578059071727, "progress_pct": 45.5, "epoch_pct": 45.5, "eta": "35:58:48", "max_grad_norm": 0.8, "loss": 0.5470858812332153, "grad_norm": 1.178859829902649, "learning_rate": 6.432524980372902e-05} +{"ts": "2025-12-28T00:11:19", "event": "train_log", "step": 6472, "epoch": 2.730801687763713, "progress_pct": 45.51, "epoch_pct": 45.51, "eta": "35:57:48", "max_grad_norm": 0.8, "loss": 0.591381311416626, "grad_norm": 1.2092641592025757, "learning_rate": 6.430224025479469e-05} +{"ts": "2025-12-28T00:11:31", "event": "train_log", "step": 6474, "epoch": 2.731645569620253, "progress_pct": 45.53, "epoch_pct": 45.53, "eta": "35:56:49", "max_grad_norm": 0.8, "loss": 0.6809561252593994, "grad_norm": 1.395704746246338, "learning_rate": 6.42792274066082e-05} +{"ts": "2025-12-28T00:11:43", "event": "train_log", "step": 6476, "epoch": 2.732489451476793, "progress_pct": 45.54, "epoch_pct": 45.54, "eta": "35:55:50", "max_grad_norm": 0.8, "loss": 0.5667102932929993, "grad_norm": 1.1937509775161743, "learning_rate": 6.42562112644782e-05} +{"ts": "2025-12-28T00:11:56", "event": "train_log", "step": 6478, "epoch": 2.7333333333333334, "progress_pct": 45.56, "epoch_pct": 45.56, "eta": "35:54:51", "max_grad_norm": 0.8, "loss": 0.5832397937774658, "grad_norm": 1.2181694507598877, "learning_rate": 6.423319183371405e-05} +{"ts": "2025-12-28T00:12:09", "event": "train_log", "step": 6480, "epoch": 2.7341772151898733, "progress_pct": 45.57, "epoch_pct": 45.57, "eta": "35:53:55", "max_grad_norm": 0.8, "loss": 0.5432526469230652, "grad_norm": 0.9961143732070923, "learning_rate": 6.42101691196259e-05} +{"ts": "2025-12-28T00:12:21", "event": "train_log", "step": 6482, "epoch": 2.7350210970464133, "progress_pct": 45.58, "epoch_pct": 45.58, "eta": "35:52:56", "max_grad_norm": 0.8, "loss": 0.5740163326263428, "grad_norm": 1.2029842138290405, "learning_rate": 6.418714312752466e-05} +{"ts": "2025-12-28T00:12:33", "event": "train_log", "step": 6484, "epoch": 2.7358649789029537, "progress_pct": 45.6, "epoch_pct": 45.6, "eta": "35:51:57", "max_grad_norm": 0.8, "loss": 0.6384599804878235, "grad_norm": 1.4317080974578857, "learning_rate": 6.416411386272196e-05} +{"ts": "2025-12-28T00:12:45", "event": "train_log", "step": 6486, "epoch": 2.7367088607594936, "progress_pct": 45.61, "epoch_pct": 45.61, "eta": "35:50:58", "max_grad_norm": 0.8, "loss": 0.6619245409965515, "grad_norm": 1.2837908267974854, "learning_rate": 6.414108133053022e-05} +{"ts": "2025-12-28T00:12:58", "event": "train_log", "step": 6488, "epoch": 2.7375527426160335, "progress_pct": 45.63, "epoch_pct": 45.63, "eta": "35:50:00", "max_grad_norm": 0.8, "loss": 0.5453745126724243, "grad_norm": 1.1140583753585815, "learning_rate": 6.41180455362626e-05} +{"ts": "2025-12-28T00:13:11", "event": "train_log", "step": 6490, "epoch": 2.738396624472574, "progress_pct": 45.64, "epoch_pct": 45.64, "eta": "35:49:02", "max_grad_norm": 0.8, "loss": 0.6225460171699524, "grad_norm": 1.1226048469543457, "learning_rate": 6.409500648523302e-05} +{"ts": "2025-12-28T00:13:23", "event": "train_log", "step": 6492, "epoch": 2.739240506329114, "progress_pct": 45.65, "epoch_pct": 45.65, "eta": "35:48:04", "max_grad_norm": 0.8, "loss": 0.5767168402671814, "grad_norm": 1.2367178201675415, "learning_rate": 6.407196418275613e-05} +{"ts": "2025-12-28T00:13:35", "event": "train_log", "step": 6494, "epoch": 2.740084388185654, "progress_pct": 45.67, "epoch_pct": 45.67, "eta": "35:47:05", "max_grad_norm": 0.8, "loss": 0.6131237745285034, "grad_norm": 1.4078115224838257, "learning_rate": 6.404891863414736e-05} +{"ts": "2025-12-28T00:13:48", "event": "train_log", "step": 6496, "epoch": 2.740928270042194, "progress_pct": 45.68, "epoch_pct": 45.68, "eta": "35:46:07", "max_grad_norm": 0.8, "loss": 0.5236409306526184, "grad_norm": 1.21550452709198, "learning_rate": 6.40258698447229e-05} +{"ts": "2025-12-28T00:14:00", "event": "train_log", "step": 6498, "epoch": 2.741772151898734, "progress_pct": 45.7, "epoch_pct": 45.7, "eta": "35:45:09", "max_grad_norm": 0.8, "loss": 0.5483267307281494, "grad_norm": 1.22257661819458, "learning_rate": 6.400281781979962e-05} +{"ts": "2025-12-28T00:14:13", "event": "train_log", "step": 6500, "epoch": 2.742616033755274, "progress_pct": 45.71, "epoch_pct": 45.71, "eta": "35:44:11", "max_grad_norm": 0.8, "loss": 0.6161116361618042, "grad_norm": 1.1525336503982544, "learning_rate": 6.39797625646952e-05} +{"ts": "2025-12-28T00:22:46", "event": "train_log", "step": 6500, "epoch": 2.742616033755274, "progress_pct": 45.71, "epoch_pct": 45.71, "eta": "35:54:21", "max_grad_norm": 0.8, "eval_loss": 0.6768895387649536, "eval_runtime": 513.0657, "eval_samples_per_second": 4.107, "eval_steps_per_second": 4.107} +{"ts": "2025-12-28T00:23:00", "event": "train_log", "step": 6502, "epoch": 2.7434599156118145, "progress_pct": 45.72, "epoch_pct": 45.72, "eta": "35:53:24", "max_grad_norm": 0.8, "loss": 0.5587809681892395, "grad_norm": 1.094993233680725, "learning_rate": 6.395670408472804e-05} +{"ts": "2025-12-28T00:23:13", "event": "train_log", "step": 6504, "epoch": 2.7443037974683544, "progress_pct": 45.74, "epoch_pct": 45.74, "eta": "35:52:26", "max_grad_norm": 0.8, "loss": 0.6118067502975464, "grad_norm": 1.1560120582580566, "learning_rate": 6.393364238521731e-05} +{"ts": "2025-12-28T00:23:25", "event": "train_log", "step": 6506, "epoch": 2.7451476793248943, "progress_pct": 45.75, "epoch_pct": 45.75, "eta": "35:51:27", "max_grad_norm": 0.8, "loss": 0.6314222812652588, "grad_norm": 1.3500670194625854, "learning_rate": 6.391057747148285e-05} +{"ts": "2025-12-28T00:23:37", "event": "train_log", "step": 6508, "epoch": 2.7459915611814347, "progress_pct": 45.77, "epoch_pct": 45.77, "eta": "35:50:29", "max_grad_norm": 0.8, "loss": 0.5695898532867432, "grad_norm": 1.2182261943817139, "learning_rate": 6.388750934884535e-05} +{"ts": "2025-12-28T00:23:50", "event": "train_log", "step": 6510, "epoch": 2.7468354430379747, "progress_pct": 45.78, "epoch_pct": 45.78, "eta": "35:49:30", "max_grad_norm": 0.8, "loss": 0.5848485827445984, "grad_norm": 1.3393630981445312, "learning_rate": 6.386443802262616e-05} +{"ts": "2025-12-28T00:24:02", "event": "train_log", "step": 6512, "epoch": 2.7476793248945146, "progress_pct": 45.79, "epoch_pct": 45.79, "eta": "35:48:31", "max_grad_norm": 0.8, "loss": 0.5920066237449646, "grad_norm": 1.412109375, "learning_rate": 6.384136349814737e-05} +{"ts": "2025-12-28T00:24:15", "event": "train_log", "step": 6514, "epoch": 2.748523206751055, "progress_pct": 45.81, "epoch_pct": 45.81, "eta": "35:47:33", "max_grad_norm": 0.8, "loss": 0.5770407319068909, "grad_norm": 1.174395203590393, "learning_rate": 6.381828578073186e-05} +{"ts": "2025-12-28T00:24:27", "event": "train_log", "step": 6516, "epoch": 2.749367088607595, "progress_pct": 45.82, "epoch_pct": 45.82, "eta": "35:46:35", "max_grad_norm": 0.8, "loss": 0.5780549049377441, "grad_norm": 1.2811627388000488, "learning_rate": 6.37952048757032e-05} +{"ts": "2025-12-28T00:24:40", "event": "train_log", "step": 6518, "epoch": 2.750210970464135, "progress_pct": 45.84, "epoch_pct": 45.84, "eta": "35:45:38", "max_grad_norm": 0.8, "loss": 0.5276137590408325, "grad_norm": 1.0966699123382568, "learning_rate": 6.377212078838573e-05} +{"ts": "2025-12-28T00:24:53", "event": "train_log", "step": 6520, "epoch": 2.7510548523206753, "progress_pct": 45.85, "epoch_pct": 45.85, "eta": "35:44:40", "max_grad_norm": 0.8, "loss": 0.5744844675064087, "grad_norm": 1.082350730895996, "learning_rate": 6.374903352410449e-05} +{"ts": "2025-12-28T00:25:05", "event": "train_log", "step": 6522, "epoch": 2.751898734177215, "progress_pct": 45.86, "epoch_pct": 45.86, "eta": "35:43:42", "max_grad_norm": 0.8, "loss": 0.6084962487220764, "grad_norm": 1.342262864112854, "learning_rate": 6.372594308818527e-05} +{"ts": "2025-12-28T00:25:18", "event": "train_log", "step": 6524, "epoch": 2.752742616033755, "progress_pct": 45.88, "epoch_pct": 45.88, "eta": "35:42:44", "max_grad_norm": 0.8, "loss": 0.5551698803901672, "grad_norm": 1.1922634840011597, "learning_rate": 6.370284948595458e-05} +{"ts": "2025-12-28T00:25:30", "event": "train_log", "step": 6526, "epoch": 2.7535864978902955, "progress_pct": 45.89, "epoch_pct": 45.89, "eta": "35:41:45", "max_grad_norm": 0.8, "loss": 0.6398477554321289, "grad_norm": 1.1368752717971802, "learning_rate": 6.36797527227397e-05} +{"ts": "2025-12-28T00:25:43", "event": "train_log", "step": 6528, "epoch": 2.7544303797468355, "progress_pct": 45.91, "epoch_pct": 45.91, "eta": "35:40:48", "max_grad_norm": 0.8, "loss": 0.6201474666595459, "grad_norm": 1.1748154163360596, "learning_rate": 6.365665280386857e-05} +{"ts": "2025-12-28T00:25:55", "event": "train_log", "step": 6530, "epoch": 2.7552742616033754, "progress_pct": 45.92, "epoch_pct": 45.92, "eta": "35:39:49", "max_grad_norm": 0.8, "loss": 0.6196629405021667, "grad_norm": 1.2439727783203125, "learning_rate": 6.363354973466993e-05} +{"ts": "2025-12-28T00:26:09", "event": "train_log", "step": 6532, "epoch": 2.756118143459916, "progress_pct": 45.94, "epoch_pct": 45.94, "eta": "35:38:52", "max_grad_norm": 0.8, "loss": 0.6379110813140869, "grad_norm": 1.146153211593628, "learning_rate": 6.36104435204732e-05} +{"ts": "2025-12-28T00:26:21", "event": "train_log", "step": 6534, "epoch": 2.7569620253164557, "progress_pct": 45.95, "epoch_pct": 45.95, "eta": "35:37:54", "max_grad_norm": 0.8, "loss": 0.5695750713348389, "grad_norm": 1.118996024131775, "learning_rate": 6.358733416660854e-05} +{"ts": "2025-12-28T00:26:34", "event": "train_log", "step": 6536, "epoch": 2.7578059071729957, "progress_pct": 45.96, "epoch_pct": 45.96, "eta": "35:36:57", "max_grad_norm": 0.8, "loss": 0.5846145153045654, "grad_norm": 1.219043493270874, "learning_rate": 6.356422167840685e-05} +{"ts": "2025-12-28T00:26:47", "event": "train_log", "step": 6538, "epoch": 2.758649789029536, "progress_pct": 45.98, "epoch_pct": 45.98, "eta": "35:36:00", "max_grad_norm": 0.8, "loss": 0.5762830972671509, "grad_norm": 1.120754361152649, "learning_rate": 6.354110606119973e-05} +{"ts": "2025-12-28T00:27:00", "event": "train_log", "step": 6540, "epoch": 2.759493670886076, "progress_pct": 45.99, "epoch_pct": 45.99, "eta": "35:35:02", "max_grad_norm": 0.8, "loss": 0.605473518371582, "grad_norm": 1.0562269687652588, "learning_rate": 6.351798732031949e-05} +{"ts": "2025-12-28T00:27:12", "event": "train_log", "step": 6542, "epoch": 2.760337552742616, "progress_pct": 46.01, "epoch_pct": 46.01, "eta": "35:34:04", "max_grad_norm": 0.8, "loss": 0.6314473748207092, "grad_norm": 1.3034429550170898, "learning_rate": 6.34948654610992e-05} +{"ts": "2025-12-28T00:27:25", "event": "train_log", "step": 6544, "epoch": 2.7611814345991563, "progress_pct": 46.02, "epoch_pct": 46.02, "eta": "35:33:07", "max_grad_norm": 0.8, "loss": 0.5332847237586975, "grad_norm": 1.1129206418991089, "learning_rate": 6.347174048887263e-05} +{"ts": "2025-12-28T00:27:38", "event": "train_log", "step": 6546, "epoch": 2.7620253164556963, "progress_pct": 46.03, "epoch_pct": 46.03, "eta": "35:32:09", "max_grad_norm": 0.8, "loss": 0.6015381813049316, "grad_norm": 1.068705439567566, "learning_rate": 6.344861240897423e-05} +{"ts": "2025-12-28T00:27:51", "event": "train_log", "step": 6548, "epoch": 2.762869198312236, "progress_pct": 46.05, "epoch_pct": 46.05, "eta": "35:31:12", "max_grad_norm": 0.8, "loss": 0.5989309549331665, "grad_norm": 1.161868691444397, "learning_rate": 6.342548122673925e-05} +{"ts": "2025-12-28T00:28:03", "event": "train_log", "step": 6550, "epoch": 2.7637130801687766, "progress_pct": 46.06, "epoch_pct": 46.06, "eta": "35:30:14", "max_grad_norm": 0.8, "loss": 0.5843837261199951, "grad_norm": 1.1323082447052002, "learning_rate": 6.340234694750359e-05} +{"ts": "2025-12-28T00:28:16", "event": "train_log", "step": 6552, "epoch": 2.7645569620253165, "progress_pct": 46.08, "epoch_pct": 46.08, "eta": "35:29:17", "max_grad_norm": 0.8, "loss": 0.603590726852417, "grad_norm": 1.2302695512771606, "learning_rate": 6.337920957660388e-05} +{"ts": "2025-12-28T00:28:28", "event": "train_log", "step": 6554, "epoch": 2.7654008438818565, "progress_pct": 46.09, "epoch_pct": 46.09, "eta": "35:28:19", "max_grad_norm": 0.8, "loss": 0.6207526326179504, "grad_norm": 1.2483820915222168, "learning_rate": 6.335606911937749e-05} +{"ts": "2025-12-28T00:28:41", "event": "train_log", "step": 6556, "epoch": 2.766244725738397, "progress_pct": 46.1, "epoch_pct": 46.1, "eta": "35:27:22", "max_grad_norm": 0.8, "loss": 0.5964639782905579, "grad_norm": 1.353147029876709, "learning_rate": 6.333292558116245e-05} +{"ts": "2025-12-28T00:28:55", "event": "train_log", "step": 6558, "epoch": 2.767088607594937, "progress_pct": 46.12, "epoch_pct": 46.12, "eta": "35:26:25", "max_grad_norm": 0.8, "loss": 0.5078298449516296, "grad_norm": 1.2074922323226929, "learning_rate": 6.330977896729755e-05} +{"ts": "2025-12-28T00:29:07", "event": "train_log", "step": 6560, "epoch": 2.7679324894514767, "progress_pct": 46.13, "epoch_pct": 46.13, "eta": "35:25:28", "max_grad_norm": 0.8, "loss": 0.5649725198745728, "grad_norm": 1.208228588104248, "learning_rate": 6.328662928312225e-05} +{"ts": "2025-12-28T00:29:20", "event": "train_log", "step": 6562, "epoch": 2.768776371308017, "progress_pct": 46.15, "epoch_pct": 46.15, "eta": "35:24:31", "max_grad_norm": 0.8, "loss": 0.5552892684936523, "grad_norm": 1.2749123573303223, "learning_rate": 6.326347653397676e-05} +{"ts": "2025-12-28T00:29:33", "event": "train_log", "step": 6564, "epoch": 2.769620253164557, "progress_pct": 46.16, "epoch_pct": 46.16, "eta": "35:23:33", "max_grad_norm": 0.8, "loss": 0.6514022350311279, "grad_norm": 1.1484880447387695, "learning_rate": 6.324032072520197e-05} +{"ts": "2025-12-28T00:29:45", "event": "train_log", "step": 6566, "epoch": 2.770464135021097, "progress_pct": 46.17, "epoch_pct": 46.17, "eta": "35:22:35", "max_grad_norm": 0.8, "loss": 0.5342835783958435, "grad_norm": 1.1836612224578857, "learning_rate": 6.321716186213946e-05} +{"ts": "2025-12-28T00:29:58", "event": "train_log", "step": 6568, "epoch": 2.7713080168776374, "progress_pct": 46.19, "epoch_pct": 46.19, "eta": "35:21:38", "max_grad_norm": 0.8, "loss": 0.6427282691001892, "grad_norm": 1.1626124382019043, "learning_rate": 6.319399995013154e-05} +{"ts": "2025-12-28T00:30:11", "event": "train_log", "step": 6570, "epoch": 2.7721518987341773, "progress_pct": 46.2, "epoch_pct": 46.2, "eta": "35:20:41", "max_grad_norm": 0.8, "loss": 0.5326613187789917, "grad_norm": 1.0736790895462036, "learning_rate": 6.317083499452123e-05} +{"ts": "2025-12-28T00:30:23", "event": "train_log", "step": 6572, "epoch": 2.7729957805907173, "progress_pct": 46.22, "epoch_pct": 46.22, "eta": "35:19:44", "max_grad_norm": 0.8, "loss": 0.543228268623352, "grad_norm": 1.1652518510818481, "learning_rate": 6.314766700065227e-05} +{"ts": "2025-12-28T00:30:36", "event": "train_log", "step": 6574, "epoch": 2.7738396624472577, "progress_pct": 46.23, "epoch_pct": 46.23, "eta": "35:18:47", "max_grad_norm": 0.8, "loss": 0.5558459758758545, "grad_norm": 1.232256531715393, "learning_rate": 6.3124495973869e-05} +{"ts": "2025-12-28T00:30:48", "event": "train_log", "step": 6576, "epoch": 2.7746835443037976, "progress_pct": 46.24, "epoch_pct": 46.24, "eta": "35:17:49", "max_grad_norm": 0.8, "loss": 0.6432561874389648, "grad_norm": 1.3306560516357422, "learning_rate": 6.310132191951659e-05} +{"ts": "2025-12-28T00:31:00", "event": "train_log", "step": 6578, "epoch": 2.7755274261603375, "progress_pct": 46.26, "epoch_pct": 46.26, "eta": "35:16:51", "max_grad_norm": 0.8, "loss": 0.6424768567085266, "grad_norm": 1.3863320350646973, "learning_rate": 6.307814484294083e-05} +{"ts": "2025-12-28T00:31:13", "event": "train_log", "step": 6580, "epoch": 2.7763713080168775, "progress_pct": 46.27, "epoch_pct": 46.27, "eta": "35:15:55", "max_grad_norm": 0.8, "loss": 0.5481483936309814, "grad_norm": 1.186691164970398, "learning_rate": 6.305496474948822e-05} +{"ts": "2025-12-28T00:31:26", "event": "train_log", "step": 6582, "epoch": 2.777215189873418, "progress_pct": 46.29, "epoch_pct": 46.29, "eta": "35:14:58", "max_grad_norm": 0.8, "loss": 0.5352432727813721, "grad_norm": 1.2820651531219482, "learning_rate": 6.303178164450596e-05} +{"ts": "2025-12-28T00:31:38", "event": "train_log", "step": 6584, "epoch": 2.778059071729958, "progress_pct": 46.3, "epoch_pct": 46.3, "eta": "35:14:00", "max_grad_norm": 0.8, "loss": 0.6270323991775513, "grad_norm": 1.1904656887054443, "learning_rate": 6.300859553334196e-05} +{"ts": "2025-12-28T00:31:51", "event": "train_log", "step": 6586, "epoch": 2.7789029535864977, "progress_pct": 46.32, "epoch_pct": 46.32, "eta": "35:13:03", "max_grad_norm": 0.8, "loss": 0.5700342059135437, "grad_norm": 1.1635342836380005, "learning_rate": 6.29854064213448e-05} +{"ts": "2025-12-28T00:32:03", "event": "train_log", "step": 6588, "epoch": 2.779746835443038, "progress_pct": 46.33, "epoch_pct": 46.33, "eta": "35:12:05", "max_grad_norm": 0.8, "loss": 0.5618587136268616, "grad_norm": 1.1065751314163208, "learning_rate": 6.296221431386379e-05} +{"ts": "2025-12-28T00:32:15", "event": "train_log", "step": 6590, "epoch": 2.780590717299578, "progress_pct": 46.34, "epoch_pct": 46.34, "eta": "35:11:07", "max_grad_norm": 0.8, "loss": 0.5982993841171265, "grad_norm": 1.3106048107147217, "learning_rate": 6.293901921624885e-05} +{"ts": "2025-12-28T00:32:27", "event": "train_log", "step": 6592, "epoch": 2.781434599156118, "progress_pct": 46.36, "epoch_pct": 46.36, "eta": "35:10:10", "max_grad_norm": 0.8, "loss": 0.6210941076278687, "grad_norm": 1.210839867591858, "learning_rate": 6.291582113385071e-05} +{"ts": "2025-12-28T00:32:40", "event": "train_log", "step": 6594, "epoch": 2.782278481012658, "progress_pct": 46.37, "epoch_pct": 46.37, "eta": "35:09:13", "max_grad_norm": 0.8, "loss": 0.5711221694946289, "grad_norm": 1.1407668590545654, "learning_rate": 6.289262007202066e-05} +{"ts": "2025-12-28T00:32:53", "event": "train_log", "step": 6596, "epoch": 2.7831223628691983, "progress_pct": 46.39, "epoch_pct": 46.39, "eta": "35:08:16", "max_grad_norm": 0.8, "loss": 0.5741305947303772, "grad_norm": 1.2315012216567993, "learning_rate": 6.286941603611078e-05} +{"ts": "2025-12-28T00:33:05", "event": "train_log", "step": 6598, "epoch": 2.7839662447257383, "progress_pct": 46.4, "epoch_pct": 46.4, "eta": "35:07:19", "max_grad_norm": 0.8, "loss": 0.5329633951187134, "grad_norm": 1.3056857585906982, "learning_rate": 6.284620903147377e-05} +{"ts": "2025-12-28T00:33:18", "event": "train_log", "step": 6600, "epoch": 2.7848101265822782, "progress_pct": 46.41, "epoch_pct": 46.41, "eta": "35:06:22", "max_grad_norm": 0.8, "loss": 0.6097646951675415, "grad_norm": 1.1501489877700806, "learning_rate": 6.282299906346306e-05} +{"ts": "2025-12-28T00:41:50", "event": "train_log", "step": 6600, "epoch": 2.7848101265822782, "progress_pct": 46.41, "epoch_pct": 46.41, "eta": "35:16:14", "max_grad_norm": 0.8, "eval_loss": 0.6737648844718933, "eval_runtime": 512.921, "eval_samples_per_second": 4.108, "eval_steps_per_second": 4.108} +{"ts": "2025-12-28T00:42:03", "event": "train_log", "step": 6602, "epoch": 2.7856540084388186, "progress_pct": 46.43, "epoch_pct": 46.43, "eta": "35:15:17", "max_grad_norm": 0.8, "loss": 0.5561007857322693, "grad_norm": 1.0871381759643555, "learning_rate": 6.279978613743275e-05} +{"ts": "2025-12-28T00:42:16", "event": "train_log", "step": 6604, "epoch": 2.7864978902953585, "progress_pct": 46.44, "epoch_pct": 46.44, "eta": "35:14:20", "max_grad_norm": 0.8, "loss": 0.5803903341293335, "grad_norm": 1.188563585281372, "learning_rate": 6.277657025873758e-05} +{"ts": "2025-12-28T00:42:29", "event": "train_log", "step": 6606, "epoch": 2.7873417721518985, "progress_pct": 46.46, "epoch_pct": 46.46, "eta": "35:13:24", "max_grad_norm": 0.8, "loss": 0.5143039226531982, "grad_norm": 1.1444810628890991, "learning_rate": 6.275335143273305e-05} +{"ts": "2025-12-28T00:42:42", "event": "train_log", "step": 6608, "epoch": 2.788185654008439, "progress_pct": 46.47, "epoch_pct": 46.47, "eta": "35:12:27", "max_grad_norm": 0.8, "loss": 0.543094277381897, "grad_norm": 1.096595287322998, "learning_rate": 6.273012966477526e-05} +{"ts": "2025-12-28T00:42:56", "event": "train_log", "step": 6610, "epoch": 2.789029535864979, "progress_pct": 46.48, "epoch_pct": 46.48, "eta": "35:11:31", "max_grad_norm": 0.8, "loss": 0.5597999095916748, "grad_norm": 1.195801019668579, "learning_rate": 6.270690496022105e-05} +{"ts": "2025-12-28T00:43:08", "event": "train_log", "step": 6612, "epoch": 2.7898734177215188, "progress_pct": 46.5, "epoch_pct": 46.5, "eta": "35:10:33", "max_grad_norm": 0.8, "loss": 0.5496288537979126, "grad_norm": 1.236894965171814, "learning_rate": 6.26836773244279e-05} +{"ts": "2025-12-28T00:43:20", "event": "train_log", "step": 6614, "epoch": 2.790717299578059, "progress_pct": 46.51, "epoch_pct": 46.51, "eta": "35:09:36", "max_grad_norm": 0.8, "loss": 0.6104549169540405, "grad_norm": 1.1474205255508423, "learning_rate": 6.2660446762754e-05} +{"ts": "2025-12-28T00:43:32", "event": "train_log", "step": 6616, "epoch": 2.791561181434599, "progress_pct": 46.53, "epoch_pct": 46.53, "eta": "35:08:38", "max_grad_norm": 0.8, "loss": 0.6186942458152771, "grad_norm": 1.1649401187896729, "learning_rate": 6.263721328055818e-05} +{"ts": "2025-12-28T00:43:46", "event": "train_log", "step": 6618, "epoch": 2.792405063291139, "progress_pct": 46.54, "epoch_pct": 46.54, "eta": "35:07:42", "max_grad_norm": 0.8, "loss": 0.5332194566726685, "grad_norm": 1.1187876462936401, "learning_rate": 6.261397688319993e-05} +{"ts": "2025-12-28T00:43:59", "event": "train_log", "step": 6620, "epoch": 2.7932489451476794, "progress_pct": 46.55, "epoch_pct": 46.55, "eta": "35:06:45", "max_grad_norm": 0.8, "loss": 0.6478220224380493, "grad_norm": 1.2765967845916748, "learning_rate": 6.25907375760395e-05} +{"ts": "2025-12-28T00:44:11", "event": "train_log", "step": 6622, "epoch": 2.7940928270042193, "progress_pct": 46.57, "epoch_pct": 46.57, "eta": "35:05:48", "max_grad_norm": 0.8, "loss": 0.6406530141830444, "grad_norm": 1.232173204421997, "learning_rate": 6.256749536443771e-05} +{"ts": "2025-12-28T00:44:24", "event": "train_log", "step": 6624, "epoch": 2.7949367088607593, "progress_pct": 46.58, "epoch_pct": 46.58, "eta": "35:04:51", "max_grad_norm": 0.8, "loss": 0.6082814931869507, "grad_norm": 1.045032262802124, "learning_rate": 6.254425025375612e-05} +{"ts": "2025-12-28T00:44:36", "event": "train_log", "step": 6626, "epoch": 2.7957805907172997, "progress_pct": 46.6, "epoch_pct": 46.6, "eta": "35:03:54", "max_grad_norm": 0.8, "loss": 0.6527243852615356, "grad_norm": 1.2285528182983398, "learning_rate": 6.252100224935689e-05} +{"ts": "2025-12-28T00:44:49", "event": "train_log", "step": 6628, "epoch": 2.7966244725738396, "progress_pct": 46.61, "epoch_pct": 46.61, "eta": "35:02:57", "max_grad_norm": 0.8, "loss": 0.5787529945373535, "grad_norm": 1.1741310358047485, "learning_rate": 6.24977513566029e-05} +{"ts": "2025-12-28T00:45:01", "event": "train_log", "step": 6630, "epoch": 2.7974683544303796, "progress_pct": 46.62, "epoch_pct": 46.62, "eta": "35:02:01", "max_grad_norm": 0.8, "loss": 0.5816542506217957, "grad_norm": 1.1933153867721558, "learning_rate": 6.247449758085773e-05} +{"ts": "2025-12-28T00:45:13", "event": "train_log", "step": 6632, "epoch": 2.79831223628692, "progress_pct": 46.64, "epoch_pct": 46.64, "eta": "35:01:03", "max_grad_norm": 0.8, "loss": 0.61644446849823, "grad_norm": 1.3991938829421997, "learning_rate": 6.245124092748552e-05} +{"ts": "2025-12-28T00:45:26", "event": "train_log", "step": 6634, "epoch": 2.79915611814346, "progress_pct": 46.65, "epoch_pct": 46.65, "eta": "35:00:06", "max_grad_norm": 0.8, "loss": 0.5762863755226135, "grad_norm": 1.1720032691955566, "learning_rate": 6.242798140185117e-05} +{"ts": "2025-12-28T00:45:38", "event": "train_log", "step": 6636, "epoch": 2.8, "progress_pct": 46.67, "epoch_pct": 46.67, "eta": "34:59:08", "max_grad_norm": 0.8, "loss": 0.656046986579895, "grad_norm": 1.2190258502960205, "learning_rate": 6.240471900932019e-05} +{"ts": "2025-12-28T00:45:50", "event": "train_log", "step": 6638, "epoch": 2.80084388185654, "progress_pct": 46.68, "epoch_pct": 46.68, "eta": "34:58:11", "max_grad_norm": 0.8, "loss": 0.5192724466323853, "grad_norm": 1.128190040588379, "learning_rate": 6.238145375525877e-05} +{"ts": "2025-12-28T00:46:03", "event": "train_log", "step": 6640, "epoch": 2.80168776371308, "progress_pct": 46.69, "epoch_pct": 46.69, "eta": "34:57:14", "max_grad_norm": 0.8, "loss": 0.6037933826446533, "grad_norm": 1.2625527381896973, "learning_rate": 6.235818564503377e-05} +{"ts": "2025-12-28T00:46:15", "event": "train_log", "step": 6642, "epoch": 2.80253164556962, "progress_pct": 46.71, "epoch_pct": 46.71, "eta": "34:56:17", "max_grad_norm": 0.8, "loss": 0.6108730435371399, "grad_norm": 1.2483288049697876, "learning_rate": 6.233491468401268e-05} +{"ts": "2025-12-28T00:46:27", "event": "train_log", "step": 6644, "epoch": 2.8033755274261605, "progress_pct": 46.72, "epoch_pct": 46.72, "eta": "34:55:21", "max_grad_norm": 0.8, "loss": 0.6408922672271729, "grad_norm": 1.3986961841583252, "learning_rate": 6.231164087756367e-05} +{"ts": "2025-12-28T00:46:40", "event": "train_log", "step": 6646, "epoch": 2.8042194092827004, "progress_pct": 46.74, "epoch_pct": 46.74, "eta": "34:54:24", "max_grad_norm": 0.8, "loss": 0.648504376411438, "grad_norm": 1.2224489450454712, "learning_rate": 6.228836423105556e-05} +{"ts": "2025-12-28T00:46:52", "event": "train_log", "step": 6648, "epoch": 2.8050632911392404, "progress_pct": 46.75, "epoch_pct": 46.75, "eta": "34:53:27", "max_grad_norm": 0.8, "loss": 0.5769880414009094, "grad_norm": 1.2060397863388062, "learning_rate": 6.226508474985782e-05} +{"ts": "2025-12-28T00:47:05", "event": "train_log", "step": 6650, "epoch": 2.8059071729957807, "progress_pct": 46.77, "epoch_pct": 46.77, "eta": "34:52:30", "max_grad_norm": 0.8, "loss": 0.6585965752601624, "grad_norm": 1.262581467628479, "learning_rate": 6.224180243934058e-05} +{"ts": "2025-12-28T00:47:18", "event": "train_log", "step": 6652, "epoch": 2.8067510548523207, "progress_pct": 46.78, "epoch_pct": 46.78, "eta": "34:51:34", "max_grad_norm": 0.8, "loss": 0.618746817111969, "grad_norm": 1.1175196170806885, "learning_rate": 6.221851730487463e-05} +{"ts": "2025-12-28T00:47:30", "event": "train_log", "step": 6654, "epoch": 2.8075949367088606, "progress_pct": 46.79, "epoch_pct": 46.79, "eta": "34:50:37", "max_grad_norm": 0.8, "loss": 0.5708954930305481, "grad_norm": 1.2256932258605957, "learning_rate": 6.219522935183141e-05} +{"ts": "2025-12-28T00:47:43", "event": "train_log", "step": 6656, "epoch": 2.808438818565401, "progress_pct": 46.81, "epoch_pct": 46.81, "eta": "34:49:41", "max_grad_norm": 0.8, "loss": 0.608521580696106, "grad_norm": 1.3388983011245728, "learning_rate": 6.217193858558298e-05} +{"ts": "2025-12-28T00:47:54", "event": "train_log", "step": 6658, "epoch": 2.809282700421941, "progress_pct": 46.82, "epoch_pct": 46.82, "eta": "34:48:43", "max_grad_norm": 0.8, "loss": 0.64382004737854, "grad_norm": 1.2913719415664673, "learning_rate": 6.214864501150208e-05} +{"ts": "2025-12-28T00:48:07", "event": "train_log", "step": 6660, "epoch": 2.810126582278481, "progress_pct": 46.84, "epoch_pct": 46.84, "eta": "34:47:47", "max_grad_norm": 0.8, "loss": 0.567484438419342, "grad_norm": 1.039406657218933, "learning_rate": 6.21253486349621e-05} +{"ts": "2025-12-28T00:48:20", "event": "train_log", "step": 6662, "epoch": 2.8109704641350213, "progress_pct": 46.85, "epoch_pct": 46.85, "eta": "34:46:51", "max_grad_norm": 0.8, "loss": 0.5696196556091309, "grad_norm": 1.123612642288208, "learning_rate": 6.210204946133707e-05} +{"ts": "2025-12-28T00:48:33", "event": "train_log", "step": 6664, "epoch": 2.811814345991561, "progress_pct": 46.86, "epoch_pct": 46.86, "eta": "34:45:55", "max_grad_norm": 0.8, "loss": 0.6068252921104431, "grad_norm": 1.1850367784500122, "learning_rate": 6.207874749600164e-05} +{"ts": "2025-12-28T00:48:45", "event": "train_log", "step": 6666, "epoch": 2.812658227848101, "progress_pct": 46.88, "epoch_pct": 46.88, "eta": "34:44:57", "max_grad_norm": 0.8, "loss": 0.6329811215400696, "grad_norm": 1.3630138635635376, "learning_rate": 6.205544274433115e-05} +{"ts": "2025-12-28T00:48:58", "event": "train_log", "step": 6668, "epoch": 2.8135021097046415, "progress_pct": 46.89, "epoch_pct": 46.89, "eta": "34:44:02", "max_grad_norm": 0.8, "loss": 0.5600330829620361, "grad_norm": 1.217410683631897, "learning_rate": 6.203213521170154e-05} +{"ts": "2025-12-28T00:49:10", "event": "train_log", "step": 6670, "epoch": 2.8143459915611815, "progress_pct": 46.91, "epoch_pct": 46.91, "eta": "34:43:05", "max_grad_norm": 0.8, "loss": 0.639461874961853, "grad_norm": 3.5133564472198486, "learning_rate": 6.200882490348942e-05} +{"ts": "2025-12-28T00:49:23", "event": "train_log", "step": 6672, "epoch": 2.8151898734177214, "progress_pct": 46.92, "epoch_pct": 46.92, "eta": "34:42:09", "max_grad_norm": 0.8, "loss": 0.5908592939376831, "grad_norm": 1.2535229921340942, "learning_rate": 6.198551182507203e-05} +{"ts": "2025-12-28T00:49:35", "event": "train_log", "step": 6674, "epoch": 2.816033755274262, "progress_pct": 46.93, "epoch_pct": 46.93, "eta": "34:41:12", "max_grad_norm": 0.8, "loss": 0.5490466952323914, "grad_norm": 1.2667300701141357, "learning_rate": 6.196219598182726e-05} +{"ts": "2025-12-28T00:49:47", "event": "train_log", "step": 6676, "epoch": 2.8168776371308017, "progress_pct": 46.95, "epoch_pct": 46.95, "eta": "34:40:15", "max_grad_norm": 0.8, "loss": 0.6570454239845276, "grad_norm": 1.332416296005249, "learning_rate": 6.19388773791336e-05} +{"ts": "2025-12-28T00:49:58", "event": "train_log", "step": 6678, "epoch": 2.8177215189873417, "progress_pct": 46.96, "epoch_pct": 46.96, "eta": "34:39:18", "max_grad_norm": 0.8, "loss": 0.6296758651733398, "grad_norm": 1.2882871627807617, "learning_rate": 6.191555602237023e-05} +{"ts": "2025-12-28T00:50:11", "event": "train_log", "step": 6680, "epoch": 2.818565400843882, "progress_pct": 46.98, "epoch_pct": 46.98, "eta": "34:38:22", "max_grad_norm": 0.8, "loss": 0.6238688826560974, "grad_norm": 1.2949540615081787, "learning_rate": 6.189223191691691e-05} +{"ts": "2025-12-28T00:50:24", "event": "train_log", "step": 6682, "epoch": 2.819409282700422, "progress_pct": 46.99, "epoch_pct": 46.99, "eta": "34:37:25", "max_grad_norm": 0.8, "loss": 0.6287838220596313, "grad_norm": 1.3507297039031982, "learning_rate": 6.18689050681541e-05} +{"ts": "2025-12-28T00:50:36", "event": "train_log", "step": 6684, "epoch": 2.820253164556962, "progress_pct": 47.0, "epoch_pct": 47.0, "eta": "34:36:29", "max_grad_norm": 0.8, "loss": 0.5871602892875671, "grad_norm": 1.0284801721572876, "learning_rate": 6.184557548146282e-05} +{"ts": "2025-12-28T00:50:49", "event": "train_log", "step": 6686, "epoch": 2.8210970464135023, "progress_pct": 47.02, "epoch_pct": 47.02, "eta": "34:35:33", "max_grad_norm": 0.8, "loss": 0.5973687171936035, "grad_norm": 1.3238089084625244, "learning_rate": 6.182224316222478e-05} +{"ts": "2025-12-28T00:51:03", "event": "train_log", "step": 6688, "epoch": 2.8219409282700423, "progress_pct": 47.03, "epoch_pct": 47.03, "eta": "34:34:38", "max_grad_norm": 0.8, "loss": 0.5463243722915649, "grad_norm": 1.0406007766723633, "learning_rate": 6.179890811582232e-05} +{"ts": "2025-12-28T00:51:16", "event": "train_log", "step": 6690, "epoch": 2.8227848101265822, "progress_pct": 47.05, "epoch_pct": 47.05, "eta": "34:33:42", "max_grad_norm": 0.8, "loss": 0.5976935625076294, "grad_norm": 1.1670905351638794, "learning_rate": 6.177557034763832e-05} +{"ts": "2025-12-28T00:51:29", "event": "train_log", "step": 6692, "epoch": 2.8236286919831226, "progress_pct": 47.06, "epoch_pct": 47.06, "eta": "34:32:47", "max_grad_norm": 0.8, "loss": 0.6159120798110962, "grad_norm": 1.0810848474502563, "learning_rate": 6.175222986305642e-05} +{"ts": "2025-12-28T00:51:41", "event": "train_log", "step": 6694, "epoch": 2.8244725738396625, "progress_pct": 47.07, "epoch_pct": 47.07, "eta": "34:31:51", "max_grad_norm": 0.8, "loss": 0.6232127547264099, "grad_norm": 1.1419588327407837, "learning_rate": 6.172888666746078e-05} +{"ts": "2025-12-28T00:51:54", "event": "train_log", "step": 6696, "epoch": 2.8253164556962025, "progress_pct": 47.09, "epoch_pct": 47.09, "eta": "34:30:55", "max_grad_norm": 0.8, "loss": 0.579402506351471, "grad_norm": 1.118447184562683, "learning_rate": 6.170554076623627e-05} +{"ts": "2025-12-28T00:52:07", "event": "train_log", "step": 6698, "epoch": 2.826160337552743, "progress_pct": 47.1, "epoch_pct": 47.1, "eta": "34:29:59", "max_grad_norm": 0.8, "loss": 0.5871124863624573, "grad_norm": 1.3584961891174316, "learning_rate": 6.168219216476828e-05} +{"ts": "2025-12-28T00:52:18", "event": "train_log", "step": 6700, "epoch": 2.827004219409283, "progress_pct": 47.12, "epoch_pct": 47.12, "eta": "34:29:02", "max_grad_norm": 0.8, "loss": 0.6119418144226074, "grad_norm": 1.1773170232772827, "learning_rate": 6.165884086844295e-05} +{"ts": "2025-12-28T01:00:52", "event": "train_log", "step": 6700, "epoch": 2.827004219409283, "progress_pct": 47.12, "epoch_pct": 47.12, "eta": "34:38:38", "max_grad_norm": 0.8, "eval_loss": 0.6737436056137085, "eval_runtime": 513.2559, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105} +{"ts": "2025-12-28T01:01:04", "event": "train_log", "step": 6702, "epoch": 2.8278481012658228, "progress_pct": 47.13, "epoch_pct": 47.13, "eta": "34:37:42", "max_grad_norm": 0.8, "loss": 0.606975257396698, "grad_norm": 1.2150315046310425, "learning_rate": 6.163548688264693e-05} +{"ts": "2025-12-28T01:01:17", "event": "train_log", "step": 6704, "epoch": 2.828691983122363, "progress_pct": 47.14, "epoch_pct": 47.14, "eta": "34:36:46", "max_grad_norm": 0.8, "loss": 0.5860852003097534, "grad_norm": 1.23250412940979, "learning_rate": 6.161213021276754e-05} +{"ts": "2025-12-28T01:01:30", "event": "train_log", "step": 6706, "epoch": 2.829535864978903, "progress_pct": 47.16, "epoch_pct": 47.16, "eta": "34:35:51", "max_grad_norm": 0.8, "loss": 0.543590784072876, "grad_norm": 1.1053578853607178, "learning_rate": 6.158877086419273e-05} +{"ts": "2025-12-28T01:01:43", "event": "train_log", "step": 6708, "epoch": 2.830379746835443, "progress_pct": 47.17, "epoch_pct": 47.17, "eta": "34:34:54", "max_grad_norm": 0.8, "loss": 0.6040283441543579, "grad_norm": 1.2813301086425781, "learning_rate": 6.156540884231105e-05} +{"ts": "2025-12-28T01:01:55", "event": "train_log", "step": 6710, "epoch": 2.831223628691983, "progress_pct": 47.19, "epoch_pct": 47.19, "eta": "34:33:58", "max_grad_norm": 0.8, "loss": 0.586407482624054, "grad_norm": 1.2987254858016968, "learning_rate": 6.154204415251169e-05} +{"ts": "2025-12-28T01:02:07", "event": "train_log", "step": 6712, "epoch": 2.8320675105485233, "progress_pct": 47.2, "epoch_pct": 47.2, "eta": "34:33:02", "max_grad_norm": 0.8, "loss": 0.6180199384689331, "grad_norm": 1.1980805397033691, "learning_rate": 6.151867680018438e-05} +{"ts": "2025-12-28T01:02:20", "event": "train_log", "step": 6714, "epoch": 2.8329113924050633, "progress_pct": 47.22, "epoch_pct": 47.22, "eta": "34:32:05", "max_grad_norm": 0.8, "loss": 0.5772807002067566, "grad_norm": 1.642957329750061, "learning_rate": 6.149530679071956e-05} +{"ts": "2025-12-28T01:02:31", "event": "train_log", "step": 6716, "epoch": 2.8337552742616032, "progress_pct": 47.23, "epoch_pct": 47.23, "eta": "34:31:08", "max_grad_norm": 0.8, "loss": 0.6107099652290344, "grad_norm": 1.3908783197402954, "learning_rate": 6.147193412950825e-05} +{"ts": "2025-12-28T01:02:44", "event": "train_log", "step": 6718, "epoch": 2.8345991561181436, "progress_pct": 47.24, "epoch_pct": 47.24, "eta": "34:30:11", "max_grad_norm": 0.8, "loss": 0.5335796475410461, "grad_norm": 1.3866089582443237, "learning_rate": 6.144855882194206e-05} +{"ts": "2025-12-28T01:02:56", "event": "train_log", "step": 6720, "epoch": 2.8354430379746836, "progress_pct": 47.26, "epoch_pct": 47.26, "eta": "34:29:16", "max_grad_norm": 0.8, "loss": 0.5962506532669067, "grad_norm": 1.2989959716796875, "learning_rate": 6.14251808734132e-05} +{"ts": "2025-12-28T01:03:08", "event": "train_log", "step": 6722, "epoch": 2.8362869198312235, "progress_pct": 47.27, "epoch_pct": 47.27, "eta": "34:28:19", "max_grad_norm": 0.8, "loss": 0.6368465423583984, "grad_norm": 1.3145360946655273, "learning_rate": 6.140180028931456e-05} +{"ts": "2025-12-28T01:03:21", "event": "train_log", "step": 6724, "epoch": 2.8371308016877634, "progress_pct": 47.29, "epoch_pct": 47.29, "eta": "34:27:24", "max_grad_norm": 0.8, "loss": 0.6448454856872559, "grad_norm": 1.1515997648239136, "learning_rate": 6.137841707503955e-05} +{"ts": "2025-12-28T01:03:36", "event": "train_log", "step": 6726, "epoch": 2.837974683544304, "progress_pct": 47.3, "epoch_pct": 47.3, "eta": "34:26:29", "max_grad_norm": 0.8, "loss": 0.49946340918540955, "grad_norm": 1.0785750150680542, "learning_rate": 6.135503123598225e-05} +{"ts": "2025-12-28T01:03:48", "event": "train_log", "step": 6728, "epoch": 2.8388185654008438, "progress_pct": 47.31, "epoch_pct": 47.31, "eta": "34:25:33", "max_grad_norm": 0.8, "loss": 0.550529956817627, "grad_norm": 1.1683695316314697, "learning_rate": 6.133164277753733e-05} +{"ts": "2025-12-28T01:04:01", "event": "train_log", "step": 6730, "epoch": 2.8396624472573837, "progress_pct": 47.33, "epoch_pct": 47.33, "eta": "34:24:37", "max_grad_norm": 0.8, "loss": 0.5135641098022461, "grad_norm": 1.0640658140182495, "learning_rate": 6.130825170510006e-05} +{"ts": "2025-12-28T01:04:13", "event": "train_log", "step": 6732, "epoch": 2.840506329113924, "progress_pct": 47.34, "epoch_pct": 47.34, "eta": "34:23:42", "max_grad_norm": 0.8, "loss": 0.6608622670173645, "grad_norm": 1.1805553436279297, "learning_rate": 6.12848580240663e-05} +{"ts": "2025-12-28T01:04:26", "event": "train_log", "step": 6734, "epoch": 2.841350210970464, "progress_pct": 47.36, "epoch_pct": 47.36, "eta": "34:22:46", "max_grad_norm": 0.8, "loss": 0.6797777414321899, "grad_norm": 1.2218462228775024, "learning_rate": 6.12614617398325e-05} +{"ts": "2025-12-28T01:04:39", "event": "train_log", "step": 6736, "epoch": 2.842194092827004, "progress_pct": 47.37, "epoch_pct": 47.37, "eta": "34:21:50", "max_grad_norm": 0.8, "loss": 0.5570073127746582, "grad_norm": 1.0677950382232666, "learning_rate": 6.123806285779576e-05} +{"ts": "2025-12-28T01:04:51", "event": "train_log", "step": 6738, "epoch": 2.8430379746835444, "progress_pct": 47.38, "epoch_pct": 47.38, "eta": "34:20:54", "max_grad_norm": 0.8, "loss": 0.6273435354232788, "grad_norm": 1.202785849571228, "learning_rate": 6.121466138335376e-05} +{"ts": "2025-12-28T01:05:04", "event": "train_log", "step": 6740, "epoch": 2.8438818565400843, "progress_pct": 47.4, "epoch_pct": 47.4, "eta": "34:19:58", "max_grad_norm": 0.8, "loss": 0.6337732076644897, "grad_norm": 1.1837576627731323, "learning_rate": 6.119125732190477e-05} +{"ts": "2025-12-28T01:05:16", "event": "train_log", "step": 6742, "epoch": 2.8447257383966242, "progress_pct": 47.41, "epoch_pct": 47.41, "eta": "34:19:02", "max_grad_norm": 0.8, "loss": 0.6228005886077881, "grad_norm": 1.2692649364471436, "learning_rate": 6.116785067884764e-05} +{"ts": "2025-12-28T01:05:28", "event": "train_log", "step": 6744, "epoch": 2.8455696202531646, "progress_pct": 47.43, "epoch_pct": 47.43, "eta": "34:18:06", "max_grad_norm": 0.8, "loss": 0.5781991481781006, "grad_norm": 1.3237874507904053, "learning_rate": 6.114444145958183e-05} +{"ts": "2025-12-28T01:05:41", "event": "train_log", "step": 6746, "epoch": 2.8464135021097046, "progress_pct": 47.44, "epoch_pct": 47.44, "eta": "34:17:10", "max_grad_norm": 0.8, "loss": 0.5583632588386536, "grad_norm": 1.2384692430496216, "learning_rate": 6.112102966950742e-05} +{"ts": "2025-12-28T01:05:54", "event": "train_log", "step": 6748, "epoch": 2.8472573839662445, "progress_pct": 47.45, "epoch_pct": 47.45, "eta": "34:16:16", "max_grad_norm": 0.8, "loss": 0.5704524517059326, "grad_norm": 1.1730914115905762, "learning_rate": 6.109761531402505e-05} +{"ts": "2025-12-28T01:06:07", "event": "train_log", "step": 6750, "epoch": 2.848101265822785, "progress_pct": 47.47, "epoch_pct": 47.47, "eta": "34:15:20", "max_grad_norm": 0.8, "loss": 0.5658026933670044, "grad_norm": 1.3047250509262085, "learning_rate": 6.107419839853597e-05} +{"ts": "2025-12-28T01:06:19", "event": "train_log", "step": 6752, "epoch": 2.848945147679325, "progress_pct": 47.48, "epoch_pct": 47.48, "eta": "34:14:24", "max_grad_norm": 0.8, "loss": 0.5919271111488342, "grad_norm": 1.2044686079025269, "learning_rate": 6.105077892844198e-05} +{"ts": "2025-12-28T01:06:31", "event": "train_log", "step": 6754, "epoch": 2.8497890295358648, "progress_pct": 47.5, "epoch_pct": 47.5, "eta": "34:13:28", "max_grad_norm": 0.8, "loss": 0.578326404094696, "grad_norm": 1.1952540874481201, "learning_rate": 6.102735690914554e-05} +{"ts": "2025-12-28T01:06:44", "event": "train_log", "step": 6756, "epoch": 2.850632911392405, "progress_pct": 47.51, "epoch_pct": 47.51, "eta": "34:12:33", "max_grad_norm": 0.8, "loss": 0.6079645156860352, "grad_norm": 1.2275413274765015, "learning_rate": 6.1003932346049633e-05} +{"ts": "2025-12-28T01:06:56", "event": "train_log", "step": 6758, "epoch": 2.851476793248945, "progress_pct": 47.52, "epoch_pct": 47.52, "eta": "34:11:37", "max_grad_norm": 0.8, "loss": 0.6111302375793457, "grad_norm": 1.2760299444198608, "learning_rate": 6.0980505244557884e-05} +{"ts": "2025-12-28T01:07:08", "event": "train_log", "step": 6760, "epoch": 2.852320675105485, "progress_pct": 47.54, "epoch_pct": 47.54, "eta": "34:10:40", "max_grad_norm": 0.8, "loss": 0.6397197246551514, "grad_norm": 1.4044286012649536, "learning_rate": 6.095707561007444e-05} +{"ts": "2025-12-28T01:07:20", "event": "train_log", "step": 6762, "epoch": 2.8531645569620254, "progress_pct": 47.55, "epoch_pct": 47.55, "eta": "34:09:44", "max_grad_norm": 0.8, "loss": 0.6183030605316162, "grad_norm": 1.3707174062728882, "learning_rate": 6.0933643448004094e-05} +{"ts": "2025-12-28T01:07:32", "event": "train_log", "step": 6764, "epoch": 2.8540084388185654, "progress_pct": 47.57, "epoch_pct": 47.57, "eta": "34:08:48", "max_grad_norm": 0.8, "loss": 0.6367093920707703, "grad_norm": 1.290480613708496, "learning_rate": 6.091020876375221e-05} +{"ts": "2025-12-28T01:07:45", "event": "train_log", "step": 6766, "epoch": 2.8548523206751053, "progress_pct": 47.58, "epoch_pct": 47.58, "eta": "34:07:53", "max_grad_norm": 0.8, "loss": 0.550685703754425, "grad_norm": 1.0469609498977661, "learning_rate": 6.0886771562724673e-05} +{"ts": "2025-12-28T01:07:57", "event": "train_log", "step": 6768, "epoch": 2.8556962025316457, "progress_pct": 47.59, "epoch_pct": 47.59, "eta": "34:06:58", "max_grad_norm": 0.8, "loss": 0.5789266228675842, "grad_norm": 1.312018871307373, "learning_rate": 6.086333185032804e-05} +{"ts": "2025-12-28T01:08:10", "event": "train_log", "step": 6770, "epoch": 2.8565400843881856, "progress_pct": 47.61, "epoch_pct": 47.61, "eta": "34:06:03", "max_grad_norm": 0.8, "loss": 0.5595589876174927, "grad_norm": 1.3253673315048218, "learning_rate": 6.0839889631969374e-05} +{"ts": "2025-12-28T01:08:23", "event": "train_log", "step": 6772, "epoch": 2.8573839662447256, "progress_pct": 47.62, "epoch_pct": 47.62, "eta": "34:05:07", "max_grad_norm": 0.8, "loss": 0.5642995238304138, "grad_norm": 1.2848154306411743, "learning_rate": 6.0816444913056356e-05} +{"ts": "2025-12-28T01:08:36", "event": "train_log", "step": 6774, "epoch": 2.858227848101266, "progress_pct": 47.64, "epoch_pct": 47.64, "eta": "34:04:12", "max_grad_norm": 0.8, "loss": 0.5502132773399353, "grad_norm": 1.2492237091064453, "learning_rate": 6.079299769899722e-05} +{"ts": "2025-12-28T01:08:49", "event": "train_log", "step": 6776, "epoch": 2.859071729957806, "progress_pct": 47.65, "epoch_pct": 47.65, "eta": "34:03:17", "max_grad_norm": 0.8, "loss": 0.5535969138145447, "grad_norm": 1.2817713022232056, "learning_rate": 6.076954799520081e-05} +{"ts": "2025-12-28T01:09:01", "event": "train_log", "step": 6778, "epoch": 2.859915611814346, "progress_pct": 47.67, "epoch_pct": 47.67, "eta": "34:02:22", "max_grad_norm": 0.8, "loss": 0.6086817979812622, "grad_norm": 1.1986786127090454, "learning_rate": 6.074609580707651e-05} +{"ts": "2025-12-28T01:09:14", "event": "train_log", "step": 6780, "epoch": 2.8607594936708862, "progress_pct": 47.68, "epoch_pct": 47.68, "eta": "34:01:26", "max_grad_norm": 0.8, "loss": 0.6254655718803406, "grad_norm": 1.274839162826538, "learning_rate": 6.0722641140034285e-05} +{"ts": "2025-12-28T01:09:27", "event": "train_log", "step": 6782, "epoch": 2.861603375527426, "progress_pct": 47.69, "epoch_pct": 47.69, "eta": "34:00:32", "max_grad_norm": 0.8, "loss": 0.6227576732635498, "grad_norm": 1.0627212524414062, "learning_rate": 6.0699183999484685e-05} +{"ts": "2025-12-28T01:09:39", "event": "train_log", "step": 6784, "epoch": 2.862447257383966, "progress_pct": 47.71, "epoch_pct": 47.71, "eta": "33:59:37", "max_grad_norm": 0.8, "loss": 0.6257740259170532, "grad_norm": 1.2313296794891357, "learning_rate": 6.0675724390838815e-05} +{"ts": "2025-12-28T01:09:52", "event": "train_log", "step": 6786, "epoch": 2.8632911392405065, "progress_pct": 47.72, "epoch_pct": 47.72, "eta": "33:58:41", "max_grad_norm": 0.8, "loss": 0.6438660621643066, "grad_norm": 1.1398836374282837, "learning_rate": 6.065226231950837e-05} +{"ts": "2025-12-28T01:10:05", "event": "train_log", "step": 6788, "epoch": 2.8641350210970464, "progress_pct": 47.74, "epoch_pct": 47.74, "eta": "33:57:46", "max_grad_norm": 0.8, "loss": 0.5654972195625305, "grad_norm": 1.1606178283691406, "learning_rate": 6.0628797790905566e-05} +{"ts": "2025-12-28T01:10:18", "event": "train_log", "step": 6790, "epoch": 2.8649789029535864, "progress_pct": 47.75, "epoch_pct": 47.75, "eta": "33:56:52", "max_grad_norm": 0.8, "loss": 0.5413897633552551, "grad_norm": 1.2857846021652222, "learning_rate": 6.060533081044326e-05} +{"ts": "2025-12-28T01:10:30", "event": "train_log", "step": 6792, "epoch": 2.8658227848101268, "progress_pct": 47.76, "epoch_pct": 47.76, "eta": "33:55:56", "max_grad_norm": 0.8, "loss": 0.5737078785896301, "grad_norm": 1.2358965873718262, "learning_rate": 6.058186138353481e-05} +{"ts": "2025-12-28T01:10:43", "event": "train_log", "step": 6794, "epoch": 2.8666666666666667, "progress_pct": 47.78, "epoch_pct": 47.78, "eta": "33:55:02", "max_grad_norm": 0.8, "loss": 0.5880253314971924, "grad_norm": 1.0813729763031006, "learning_rate": 6.055838951559417e-05} +{"ts": "2025-12-28T01:10:56", "event": "train_log", "step": 6796, "epoch": 2.8675105485232066, "progress_pct": 47.79, "epoch_pct": 47.79, "eta": "33:54:07", "max_grad_norm": 0.8, "loss": 0.5762695074081421, "grad_norm": 1.2310819625854492, "learning_rate": 6.0534915212035836e-05} +{"ts": "2025-12-28T01:11:08", "event": "train_log", "step": 6798, "epoch": 2.868354430379747, "progress_pct": 47.81, "epoch_pct": 47.81, "eta": "33:53:12", "max_grad_norm": 0.8, "loss": 0.6172254085540771, "grad_norm": 1.2762445211410522, "learning_rate": 6.0511438478274906e-05} +{"ts": "2025-12-28T01:11:22", "event": "train_log", "step": 6800, "epoch": 2.869198312236287, "progress_pct": 47.82, "epoch_pct": 47.82, "eta": "33:52:17", "max_grad_norm": 0.8, "loss": 0.5419955849647522, "grad_norm": 1.0100860595703125, "learning_rate": 6.0487959319726994e-05} +{"ts": "2025-12-28T01:19:55", "event": "train_log", "step": 6800, "epoch": 2.869198312236287, "progress_pct": 47.82, "epoch_pct": 47.82, "eta": "34:01:37", "max_grad_norm": 0.8, "eval_loss": 0.6721681356430054, "eval_runtime": 513.1285, "eval_samples_per_second": 4.106, "eval_steps_per_second": 4.106} +{"ts": "2025-12-28T01:20:07", "event": "train_log", "step": 6802, "epoch": 2.870042194092827, "progress_pct": 47.83, "epoch_pct": 47.83, "eta": "34:00:41", "max_grad_norm": 0.8, "loss": 0.6330351233482361, "grad_norm": 1.3078527450561523, "learning_rate": 6.046447774180827e-05} +{"ts": "2025-12-28T01:20:19", "event": "train_log", "step": 6804, "epoch": 2.8708860759493673, "progress_pct": 47.85, "epoch_pct": 47.85, "eta": "33:59:46", "max_grad_norm": 0.8, "loss": 0.5479466915130615, "grad_norm": 1.3523176908493042, "learning_rate": 6.044099374993553e-05} +{"ts": "2025-12-28T01:20:32", "event": "train_log", "step": 6806, "epoch": 2.8717299578059072, "progress_pct": 47.86, "epoch_pct": 47.86, "eta": "33:58:51", "max_grad_norm": 0.8, "loss": 0.5516952872276306, "grad_norm": 1.109269142150879, "learning_rate": 6.041750734952604e-05} +{"ts": "2025-12-28T01:20:44", "event": "train_log", "step": 6808, "epoch": 2.872573839662447, "progress_pct": 47.88, "epoch_pct": 47.88, "eta": "33:57:55", "max_grad_norm": 0.8, "loss": 0.5878147482872009, "grad_norm": 1.2368918657302856, "learning_rate": 6.039401854599769e-05} +{"ts": "2025-12-28T01:20:56", "event": "train_log", "step": 6810, "epoch": 2.8734177215189876, "progress_pct": 47.89, "epoch_pct": 47.89, "eta": "33:57:00", "max_grad_norm": 0.8, "loss": 0.5637685656547546, "grad_norm": 1.1626032590866089, "learning_rate": 6.037052734476886e-05} +{"ts": "2025-12-28T01:21:09", "event": "train_log", "step": 6812, "epoch": 2.8742616033755275, "progress_pct": 47.9, "epoch_pct": 47.9, "eta": "33:56:05", "max_grad_norm": 0.8, "loss": 0.5398213267326355, "grad_norm": 1.1955288648605347, "learning_rate": 6.0347033751258566e-05} +{"ts": "2025-12-28T01:21:21", "event": "train_log", "step": 6814, "epoch": 2.8751054852320674, "progress_pct": 47.92, "epoch_pct": 47.92, "eta": "33:55:09", "max_grad_norm": 0.8, "loss": 0.6098157167434692, "grad_norm": 1.3805105686187744, "learning_rate": 6.0323537770886285e-05} +{"ts": "2025-12-28T01:21:33", "event": "train_log", "step": 6816, "epoch": 2.875949367088608, "progress_pct": 47.93, "epoch_pct": 47.93, "eta": "33:54:13", "max_grad_norm": 0.8, "loss": 0.5970560312271118, "grad_norm": 1.2644819021224976, "learning_rate": 6.030003940907212e-05} +{"ts": "2025-12-28T01:21:46", "event": "train_log", "step": 6818, "epoch": 2.8767932489451478, "progress_pct": 47.95, "epoch_pct": 47.95, "eta": "33:53:18", "max_grad_norm": 0.8, "loss": 0.5918156504631042, "grad_norm": 1.1625932455062866, "learning_rate": 6.027653867123667e-05} +{"ts": "2025-12-28T01:21:58", "event": "train_log", "step": 6820, "epoch": 2.8776371308016877, "progress_pct": 47.96, "epoch_pct": 47.96, "eta": "33:52:23", "max_grad_norm": 0.8, "loss": 0.5625584721565247, "grad_norm": 1.3591371774673462, "learning_rate": 6.025303556280112e-05} +{"ts": "2025-12-28T01:22:10", "event": "train_log", "step": 6822, "epoch": 2.878481012658228, "progress_pct": 47.97, "epoch_pct": 47.97, "eta": "33:51:28", "max_grad_norm": 0.8, "loss": 0.6422242522239685, "grad_norm": 1.266757845878601, "learning_rate": 6.022953008918718e-05} +{"ts": "2025-12-28T01:22:22", "event": "train_log", "step": 6824, "epoch": 2.879324894514768, "progress_pct": 47.99, "epoch_pct": 47.99, "eta": "33:50:31", "max_grad_norm": 0.8, "loss": 0.6625136733055115, "grad_norm": 1.273234248161316, "learning_rate": 6.0206022255817095e-05} +{"ts": "2025-12-28T01:22:34", "event": "train_log", "step": 6826, "epoch": 2.880168776371308, "progress_pct": 48.0, "epoch_pct": 48.0, "eta": "33:49:36", "max_grad_norm": 0.8, "loss": 0.6410037279129028, "grad_norm": 1.2808254957199097, "learning_rate": 6.0182512068113715e-05} +{"ts": "2025-12-28T01:22:47", "event": "train_log", "step": 6828, "epoch": 2.8810126582278484, "progress_pct": 48.02, "epoch_pct": 48.02, "eta": "33:48:41", "max_grad_norm": 0.8, "loss": 0.5269461274147034, "grad_norm": 1.1684991121292114, "learning_rate": 6.0158999531500335e-05} +{"ts": "2025-12-28T01:22:59", "event": "train_log", "step": 6830, "epoch": 2.8818565400843883, "progress_pct": 48.03, "epoch_pct": 48.03, "eta": "33:47:46", "max_grad_norm": 0.8, "loss": 0.6546348929405212, "grad_norm": 1.3655736446380615, "learning_rate": 6.0135484651400886e-05} +{"ts": "2025-12-28T01:23:11", "event": "train_log", "step": 6832, "epoch": 2.8827004219409282, "progress_pct": 48.05, "epoch_pct": 48.05, "eta": "33:46:50", "max_grad_norm": 0.8, "loss": 0.5872722864151001, "grad_norm": 1.3913087844848633, "learning_rate": 6.011196743323977e-05} +{"ts": "2025-12-28T01:23:23", "event": "train_log", "step": 6834, "epoch": 2.8835443037974686, "progress_pct": 48.06, "epoch_pct": 48.06, "eta": "33:45:55", "max_grad_norm": 0.8, "loss": 0.5498786568641663, "grad_norm": 1.1047117710113525, "learning_rate": 6.008844788244199e-05} +{"ts": "2025-12-28T01:23:36", "event": "train_log", "step": 6836, "epoch": 2.8843881856540086, "progress_pct": 48.07, "epoch_pct": 48.07, "eta": "33:45:00", "max_grad_norm": 0.8, "loss": 0.5740244388580322, "grad_norm": 1.0897705554962158, "learning_rate": 6.006492600443301e-05} +{"ts": "2025-12-28T01:23:49", "event": "train_log", "step": 6838, "epoch": 2.8852320675105485, "progress_pct": 48.09, "epoch_pct": 48.09, "eta": "33:44:06", "max_grad_norm": 0.8, "loss": 0.5618779063224792, "grad_norm": 1.0046823024749756, "learning_rate": 6.004140180463891e-05} +{"ts": "2025-12-28T01:24:01", "event": "train_log", "step": 6840, "epoch": 2.8860759493670884, "progress_pct": 48.1, "epoch_pct": 48.1, "eta": "33:43:11", "max_grad_norm": 0.8, "loss": 0.6124269366264343, "grad_norm": 1.231499195098877, "learning_rate": 6.001787528848628e-05} +{"ts": "2025-12-28T01:24:14", "event": "train_log", "step": 6842, "epoch": 2.886919831223629, "progress_pct": 48.12, "epoch_pct": 48.12, "eta": "33:42:16", "max_grad_norm": 0.8, "loss": 0.5512109994888306, "grad_norm": 1.1776596307754517, "learning_rate": 5.999434646140219e-05} +{"ts": "2025-12-28T01:24:26", "event": "train_log", "step": 6844, "epoch": 2.8877637130801688, "progress_pct": 48.13, "epoch_pct": 48.13, "eta": "33:41:21", "max_grad_norm": 0.8, "loss": 0.610329270362854, "grad_norm": 1.2528871297836304, "learning_rate": 5.9970815328814334e-05} +{"ts": "2025-12-28T01:24:38", "event": "train_log", "step": 6846, "epoch": 2.8886075949367087, "progress_pct": 48.14, "epoch_pct": 48.14, "eta": "33:40:25", "max_grad_norm": 0.8, "loss": 0.568793773651123, "grad_norm": 1.4408416748046875, "learning_rate": 5.994728189615087e-05} +{"ts": "2025-12-28T01:24:51", "event": "train_log", "step": 6848, "epoch": 2.889451476793249, "progress_pct": 48.16, "epoch_pct": 48.16, "eta": "33:39:31", "max_grad_norm": 0.8, "loss": 0.6107773184776306, "grad_norm": 1.2031673192977905, "learning_rate": 5.9923746168840523e-05} +{"ts": "2025-12-28T01:25:03", "event": "train_log", "step": 6850, "epoch": 2.890295358649789, "progress_pct": 48.17, "epoch_pct": 48.17, "eta": "33:38:35", "max_grad_norm": 0.8, "loss": 0.6217910647392273, "grad_norm": 1.3201221227645874, "learning_rate": 5.990020815231251e-05} +{"ts": "2025-12-28T01:25:15", "event": "train_log", "step": 6852, "epoch": 2.891139240506329, "progress_pct": 48.19, "epoch_pct": 48.19, "eta": "33:37:40", "max_grad_norm": 0.8, "loss": 0.6051784157752991, "grad_norm": 1.1753840446472168, "learning_rate": 5.987666785199661e-05} +{"ts": "2025-12-28T01:25:27", "event": "train_log", "step": 6854, "epoch": 2.8919831223628694, "progress_pct": 48.2, "epoch_pct": 48.2, "eta": "33:36:45", "max_grad_norm": 0.8, "loss": 0.5736448168754578, "grad_norm": 1.2406786680221558, "learning_rate": 5.985312527332314e-05} +{"ts": "2025-12-28T01:25:41", "event": "train_log", "step": 6856, "epoch": 2.8928270042194093, "progress_pct": 48.21, "epoch_pct": 48.21, "eta": "33:35:51", "max_grad_norm": 0.8, "loss": 0.5454224944114685, "grad_norm": 1.6206021308898926, "learning_rate": 5.98295804217229e-05} +{"ts": "2025-12-28T01:25:53", "event": "train_log", "step": 6858, "epoch": 2.8936708860759492, "progress_pct": 48.23, "epoch_pct": 48.23, "eta": "33:34:56", "max_grad_norm": 0.8, "loss": 0.5912685990333557, "grad_norm": 1.2756178379058838, "learning_rate": 5.9806033302627227e-05} +{"ts": "2025-12-28T01:26:05", "event": "train_log", "step": 6860, "epoch": 2.894514767932489, "progress_pct": 48.24, "epoch_pct": 48.24, "eta": "33:34:02", "max_grad_norm": 0.8, "loss": 0.5619014501571655, "grad_norm": 1.223631501197815, "learning_rate": 5.9782483921468e-05} +{"ts": "2025-12-28T01:26:19", "event": "train_log", "step": 6862, "epoch": 2.8953586497890296, "progress_pct": 48.26, "epoch_pct": 48.26, "eta": "33:33:08", "max_grad_norm": 0.8, "loss": 0.5629459619522095, "grad_norm": 1.06546151638031, "learning_rate": 5.975893228367762e-05} +{"ts": "2025-12-28T01:26:31", "event": "train_log", "step": 6864, "epoch": 2.8962025316455695, "progress_pct": 48.27, "epoch_pct": 48.27, "eta": "33:32:13", "max_grad_norm": 0.8, "loss": 0.4997110366821289, "grad_norm": 1.0573277473449707, "learning_rate": 5.9735378394688965e-05} +{"ts": "2025-12-28T01:26:43", "event": "train_log", "step": 6866, "epoch": 2.8970464135021095, "progress_pct": 48.28, "epoch_pct": 48.28, "eta": "33:31:19", "max_grad_norm": 0.8, "loss": 0.6370334625244141, "grad_norm": 1.2832465171813965, "learning_rate": 5.97118222599355e-05} +{"ts": "2025-12-28T01:26:57", "event": "train_log", "step": 6868, "epoch": 2.89789029535865, "progress_pct": 48.3, "epoch_pct": 48.3, "eta": "33:30:25", "max_grad_norm": 0.8, "loss": 0.6095840334892273, "grad_norm": 1.1721924543380737, "learning_rate": 5.968826388485116e-05} +{"ts": "2025-12-28T01:27:09", "event": "train_log", "step": 6870, "epoch": 2.8987341772151898, "progress_pct": 48.31, "epoch_pct": 48.31, "eta": "33:29:30", "max_grad_norm": 0.8, "loss": 0.6075419187545776, "grad_norm": 1.1428951025009155, "learning_rate": 5.966470327487042e-05} +{"ts": "2025-12-28T01:27:21", "event": "train_log", "step": 6872, "epoch": 2.8995780590717297, "progress_pct": 48.33, "epoch_pct": 48.33, "eta": "33:28:35", "max_grad_norm": 0.8, "loss": 0.6376850605010986, "grad_norm": 1.2369399070739746, "learning_rate": 5.964114043542822e-05} +{"ts": "2025-12-28T01:27:33", "event": "train_log", "step": 6874, "epoch": 2.90042194092827, "progress_pct": 48.34, "epoch_pct": 48.34, "eta": "33:27:40", "max_grad_norm": 0.8, "loss": 0.57747882604599, "grad_norm": 1.178520679473877, "learning_rate": 5.961757537196011e-05} +{"ts": "2025-12-28T01:27:46", "event": "train_log", "step": 6876, "epoch": 2.90126582278481, "progress_pct": 48.35, "epoch_pct": 48.35, "eta": "33:26:46", "max_grad_norm": 0.8, "loss": 0.626102864742279, "grad_norm": 1.2600151300430298, "learning_rate": 5.959400808990205e-05} +{"ts": "2025-12-28T01:27:58", "event": "train_log", "step": 6878, "epoch": 2.90210970464135, "progress_pct": 48.37, "epoch_pct": 48.37, "eta": "33:25:51", "max_grad_norm": 0.8, "loss": 0.6087106466293335, "grad_norm": 1.2809659242630005, "learning_rate": 5.957043859469058e-05} +{"ts": "2025-12-28T01:28:11", "event": "train_log", "step": 6880, "epoch": 2.9029535864978904, "progress_pct": 48.38, "epoch_pct": 48.38, "eta": "33:24:57", "max_grad_norm": 0.8, "loss": 0.599288284778595, "grad_norm": 1.2029764652252197, "learning_rate": 5.954686689176274e-05} +{"ts": "2025-12-28T01:28:23", "event": "train_log", "step": 6882, "epoch": 2.9037974683544303, "progress_pct": 48.4, "epoch_pct": 48.4, "eta": "33:24:02", "max_grad_norm": 0.8, "loss": 0.6364397406578064, "grad_norm": 1.2000751495361328, "learning_rate": 5.952329298655607e-05} +{"ts": "2025-12-28T01:28:35", "event": "train_log", "step": 6884, "epoch": 2.9046413502109703, "progress_pct": 48.41, "epoch_pct": 48.41, "eta": "33:23:07", "max_grad_norm": 0.8, "loss": 0.6032583713531494, "grad_norm": 1.3380756378173828, "learning_rate": 5.949971688450859e-05} +{"ts": "2025-12-28T01:28:48", "event": "train_log", "step": 6886, "epoch": 2.9054852320675106, "progress_pct": 48.42, "epoch_pct": 48.42, "eta": "33:22:13", "max_grad_norm": 0.8, "loss": 0.6217718720436096, "grad_norm": 1.207139015197754, "learning_rate": 5.9476138591058874e-05} +{"ts": "2025-12-28T01:29:00", "event": "train_log", "step": 6888, "epoch": 2.9063291139240506, "progress_pct": 48.44, "epoch_pct": 48.44, "eta": "33:21:19", "max_grad_norm": 0.8, "loss": 0.5663400888442993, "grad_norm": 1.2060731649398804, "learning_rate": 5.945255811164598e-05} +{"ts": "2025-12-28T01:29:13", "event": "train_log", "step": 6890, "epoch": 2.9071729957805905, "progress_pct": 48.45, "epoch_pct": 48.45, "eta": "33:20:25", "max_grad_norm": 0.8, "loss": 0.583290696144104, "grad_norm": 1.3331942558288574, "learning_rate": 5.9428975451709465e-05} +{"ts": "2025-12-28T01:29:25", "event": "train_log", "step": 6892, "epoch": 2.908016877637131, "progress_pct": 48.47, "epoch_pct": 48.47, "eta": "33:19:30", "max_grad_norm": 0.8, "loss": 0.5606404542922974, "grad_norm": 1.226565957069397, "learning_rate": 5.94053906166894e-05} +{"ts": "2025-12-28T01:29:38", "event": "train_log", "step": 6894, "epoch": 2.908860759493671, "progress_pct": 48.48, "epoch_pct": 48.48, "eta": "33:18:36", "max_grad_norm": 0.8, "loss": 0.5337109565734863, "grad_norm": 1.167909026145935, "learning_rate": 5.938180361202636e-05} +{"ts": "2025-12-28T01:29:50", "event": "train_log", "step": 6896, "epoch": 2.909704641350211, "progress_pct": 48.5, "epoch_pct": 48.5, "eta": "33:17:42", "max_grad_norm": 0.8, "loss": 0.64582759141922, "grad_norm": 1.2748368978500366, "learning_rate": 5.93582144431614e-05} +{"ts": "2025-12-28T01:30:02", "event": "train_log", "step": 6898, "epoch": 2.910548523206751, "progress_pct": 48.51, "epoch_pct": 48.51, "eta": "33:16:47", "max_grad_norm": 0.8, "loss": 0.631919801235199, "grad_norm": 1.2209413051605225, "learning_rate": 5.93346231155361e-05} +{"ts": "2025-12-28T01:30:14", "event": "train_log", "step": 6900, "epoch": 2.911392405063291, "progress_pct": 48.52, "epoch_pct": 48.52, "eta": "33:15:52", "max_grad_norm": 0.8, "loss": 0.5999054908752441, "grad_norm": 1.2692270278930664, "learning_rate": 5.931102963459252e-05} +{"ts": "2025-12-28T01:38:47", "event": "train_log", "step": 6900, "epoch": 2.911392405063291, "progress_pct": 48.52, "epoch_pct": 48.52, "eta": "33:24:57", "max_grad_norm": 0.8, "eval_loss": 0.6713213920593262, "eval_runtime": 513.1265, "eval_samples_per_second": 4.106, "eval_steps_per_second": 4.106} +{"ts": "2025-12-28T01:39:00", "event": "train_log", "step": 6902, "epoch": 2.912236286919831, "progress_pct": 48.54, "epoch_pct": 48.54, "eta": "33:24:02", "max_grad_norm": 0.8, "loss": 0.634549081325531, "grad_norm": 1.3654414415359497, "learning_rate": 5.928743400577323e-05} +{"ts": "2025-12-28T01:39:12", "event": "train_log", "step": 6904, "epoch": 2.9130801687763714, "progress_pct": 48.55, "epoch_pct": 48.55, "eta": "33:23:07", "max_grad_norm": 0.8, "loss": 0.684973418712616, "grad_norm": 1.4427542686462402, "learning_rate": 5.926383623452128e-05} +{"ts": "2025-12-28T01:39:24", "event": "train_log", "step": 6906, "epoch": 2.9139240506329114, "progress_pct": 48.57, "epoch_pct": 48.57, "eta": "33:22:12", "max_grad_norm": 0.8, "loss": 0.6641559600830078, "grad_norm": 1.3192591667175293, "learning_rate": 5.9240236326280216e-05} +{"ts": "2025-12-28T01:39:36", "event": "train_log", "step": 6908, "epoch": 2.9147679324894513, "progress_pct": 48.58, "epoch_pct": 48.58, "eta": "33:21:17", "max_grad_norm": 0.8, "loss": 0.6443751454353333, "grad_norm": 1.3328732252120972, "learning_rate": 5.921663428649411e-05} +{"ts": "2025-12-28T01:39:48", "event": "train_log", "step": 6910, "epoch": 2.9156118143459917, "progress_pct": 48.59, "epoch_pct": 48.59, "eta": "33:20:23", "max_grad_norm": 0.8, "loss": 0.674626886844635, "grad_norm": 1.191504716873169, "learning_rate": 5.9193030120607486e-05} +{"ts": "2025-12-28T01:40:00", "event": "train_log", "step": 6912, "epoch": 2.9164556962025316, "progress_pct": 48.61, "epoch_pct": 48.61, "eta": "33:19:28", "max_grad_norm": 0.8, "loss": 0.6297666430473328, "grad_norm": 1.2599490880966187, "learning_rate": 5.916942383406535e-05} +{"ts": "2025-12-28T01:40:14", "event": "train_log", "step": 6914, "epoch": 2.9172995780590716, "progress_pct": 48.62, "epoch_pct": 48.62, "eta": "33:18:35", "max_grad_norm": 0.8, "loss": 0.5809952616691589, "grad_norm": 0.9829303622245789, "learning_rate": 5.914581543231324e-05} +{"ts": "2025-12-28T01:40:26", "event": "train_log", "step": 6916, "epoch": 2.918143459915612, "progress_pct": 48.64, "epoch_pct": 48.64, "eta": "33:17:41", "max_grad_norm": 0.8, "loss": 0.6383126974105835, "grad_norm": 1.1566280126571655, "learning_rate": 5.9122204920797176e-05} +{"ts": "2025-12-28T01:40:40", "event": "train_log", "step": 6918, "epoch": 2.918987341772152, "progress_pct": 48.65, "epoch_pct": 48.65, "eta": "33:16:47", "max_grad_norm": 0.8, "loss": 0.5681729316711426, "grad_norm": 1.047351360321045, "learning_rate": 5.9098592304963616e-05} +{"ts": "2025-12-28T01:40:52", "event": "train_log", "step": 6920, "epoch": 2.919831223628692, "progress_pct": 48.66, "epoch_pct": 48.66, "eta": "33:15:53", "max_grad_norm": 0.8, "loss": 0.5985210537910461, "grad_norm": 1.2059552669525146, "learning_rate": 5.907497759025956e-05} +{"ts": "2025-12-28T01:41:05", "event": "train_log", "step": 6922, "epoch": 2.9206751054852322, "progress_pct": 48.68, "epoch_pct": 48.68, "eta": "33:14:59", "max_grad_norm": 0.8, "loss": 0.5815024375915527, "grad_norm": 1.1845992803573608, "learning_rate": 5.905136078213247e-05} +{"ts": "2025-12-28T01:41:18", "event": "train_log", "step": 6924, "epoch": 2.921518987341772, "progress_pct": 48.69, "epoch_pct": 48.69, "eta": "33:14:05", "max_grad_norm": 0.8, "loss": 0.6437575221061707, "grad_norm": 1.3542579412460327, "learning_rate": 5.9027741886030266e-05} +{"ts": "2025-12-28T01:41:30", "event": "train_log", "step": 6926, "epoch": 2.922362869198312, "progress_pct": 48.71, "epoch_pct": 48.71, "eta": "33:13:11", "max_grad_norm": 0.8, "loss": 0.5773448348045349, "grad_norm": 1.1001946926116943, "learning_rate": 5.900412090740139e-05} +{"ts": "2025-12-28T01:41:42", "event": "train_log", "step": 6928, "epoch": 2.9232067510548525, "progress_pct": 48.72, "epoch_pct": 48.72, "eta": "33:12:16", "max_grad_norm": 0.8, "loss": 0.6076427698135376, "grad_norm": 1.220449447631836, "learning_rate": 5.898049785169476e-05} +{"ts": "2025-12-28T01:41:56", "event": "train_log", "step": 6930, "epoch": 2.9240506329113924, "progress_pct": 48.73, "epoch_pct": 48.73, "eta": "33:11:23", "max_grad_norm": 0.8, "loss": 0.5418170690536499, "grad_norm": 1.126592993736267, "learning_rate": 5.895687272435975e-05} +{"ts": "2025-12-28T01:42:09", "event": "train_log", "step": 6932, "epoch": 2.9248945147679324, "progress_pct": 48.75, "epoch_pct": 48.75, "eta": "33:10:30", "max_grad_norm": 0.8, "loss": 0.6057441234588623, "grad_norm": 1.1005871295928955, "learning_rate": 5.893324553084622e-05} +{"ts": "2025-12-28T01:42:22", "event": "train_log", "step": 6934, "epoch": 2.9257383966244728, "progress_pct": 48.76, "epoch_pct": 48.76, "eta": "33:09:36", "max_grad_norm": 0.8, "loss": 0.4844438433647156, "grad_norm": 1.0291813611984253, "learning_rate": 5.89096162766045e-05} +{"ts": "2025-12-28T01:42:35", "event": "train_log", "step": 6936, "epoch": 2.9265822784810127, "progress_pct": 48.78, "epoch_pct": 48.78, "eta": "33:08:42", "max_grad_norm": 0.8, "loss": 0.5230311751365662, "grad_norm": 1.0685851573944092, "learning_rate": 5.888598496708543e-05} +{"ts": "2025-12-28T01:42:47", "event": "train_log", "step": 6938, "epoch": 2.9274261603375527, "progress_pct": 48.79, "epoch_pct": 48.79, "eta": "33:07:48", "max_grad_norm": 0.8, "loss": 0.6191393136978149, "grad_norm": 1.1004319190979004, "learning_rate": 5.8862351607740285e-05} +{"ts": "2025-12-28T01:42:59", "event": "train_log", "step": 6940, "epoch": 2.928270042194093, "progress_pct": 48.8, "epoch_pct": 48.8, "eta": "33:06:54", "max_grad_norm": 0.8, "loss": 0.5574309825897217, "grad_norm": 1.2164443731307983, "learning_rate": 5.8838716204020815e-05} +{"ts": "2025-12-28T01:43:12", "event": "train_log", "step": 6942, "epoch": 2.929113924050633, "progress_pct": 48.82, "epoch_pct": 48.82, "eta": "33:06:00", "max_grad_norm": 0.8, "loss": 0.5820326209068298, "grad_norm": 1.104511022567749, "learning_rate": 5.881507876137928e-05} +{"ts": "2025-12-28T01:43:24", "event": "train_log", "step": 6944, "epoch": 2.929957805907173, "progress_pct": 48.83, "epoch_pct": 48.83, "eta": "33:05:06", "max_grad_norm": 0.8, "loss": 0.6016243696212769, "grad_norm": 1.4402027130126953, "learning_rate": 5.879143928526838e-05} +{"ts": "2025-12-28T01:43:36", "event": "train_log", "step": 6946, "epoch": 2.9308016877637133, "progress_pct": 48.85, "epoch_pct": 48.85, "eta": "33:04:12", "max_grad_norm": 0.8, "loss": 0.574772834777832, "grad_norm": 1.2131510972976685, "learning_rate": 5.8767797781141274e-05} +{"ts": "2025-12-28T01:43:49", "event": "train_log", "step": 6948, "epoch": 2.9316455696202532, "progress_pct": 48.86, "epoch_pct": 48.86, "eta": "33:03:18", "max_grad_norm": 0.8, "loss": 0.6725581884384155, "grad_norm": 1.2146058082580566, "learning_rate": 5.874415425445159e-05} +{"ts": "2025-12-28T01:44:01", "event": "train_log", "step": 6950, "epoch": 2.932489451476793, "progress_pct": 48.87, "epoch_pct": 48.87, "eta": "33:02:24", "max_grad_norm": 0.8, "loss": 0.5900663733482361, "grad_norm": 1.2887672185897827, "learning_rate": 5.872050871065349e-05} +{"ts": "2025-12-28T01:44:13", "event": "train_log", "step": 6952, "epoch": 2.9333333333333336, "progress_pct": 48.89, "epoch_pct": 48.89, "eta": "33:01:30", "max_grad_norm": 0.8, "loss": 0.6624540686607361, "grad_norm": 1.340739369392395, "learning_rate": 5.869686115520148e-05} +{"ts": "2025-12-28T01:44:26", "event": "train_log", "step": 6954, "epoch": 2.9341772151898735, "progress_pct": 48.9, "epoch_pct": 48.9, "eta": "33:00:36", "max_grad_norm": 0.8, "loss": 0.5319855809211731, "grad_norm": 1.3531051874160767, "learning_rate": 5.867321159355062e-05} +{"ts": "2025-12-28T01:44:38", "event": "train_log", "step": 6956, "epoch": 2.9350210970464135, "progress_pct": 48.92, "epoch_pct": 48.92, "eta": "32:59:41", "max_grad_norm": 0.8, "loss": 0.6661397218704224, "grad_norm": 1.441260814666748, "learning_rate": 5.864956003115646e-05} +{"ts": "2025-12-28T01:44:51", "event": "train_log", "step": 6958, "epoch": 2.935864978902954, "progress_pct": 48.93, "epoch_pct": 48.93, "eta": "32:58:48", "max_grad_norm": 0.8, "loss": 0.6062843799591064, "grad_norm": 1.314922571182251, "learning_rate": 5.862590647347488e-05} +{"ts": "2025-12-28T01:45:03", "event": "train_log", "step": 6960, "epoch": 2.9367088607594938, "progress_pct": 48.95, "epoch_pct": 48.95, "eta": "32:57:54", "max_grad_norm": 0.8, "loss": 0.6123294234275818, "grad_norm": 1.134419560432434, "learning_rate": 5.860225092596237e-05} +{"ts": "2025-12-28T01:45:15", "event": "train_log", "step": 6962, "epoch": 2.9375527426160337, "progress_pct": 48.96, "epoch_pct": 48.96, "eta": "32:56:59", "max_grad_norm": 0.8, "loss": 0.5984833240509033, "grad_norm": 1.3195313215255737, "learning_rate": 5.8578593394075746e-05} +{"ts": "2025-12-28T01:45:28", "event": "train_log", "step": 6964, "epoch": 2.938396624472574, "progress_pct": 48.97, "epoch_pct": 48.97, "eta": "32:56:06", "max_grad_norm": 0.8, "loss": 0.5695837736129761, "grad_norm": 1.1626067161560059, "learning_rate": 5.855493388327242e-05} +{"ts": "2025-12-28T01:45:40", "event": "train_log", "step": 6966, "epoch": 2.939240506329114, "progress_pct": 48.99, "epoch_pct": 48.99, "eta": "32:55:12", "max_grad_norm": 0.8, "loss": 0.5688632726669312, "grad_norm": 1.1392630338668823, "learning_rate": 5.853127239901012e-05} +{"ts": "2025-12-28T01:45:53", "event": "train_log", "step": 6968, "epoch": 2.940084388185654, "progress_pct": 49.0, "epoch_pct": 49.0, "eta": "32:54:19", "max_grad_norm": 0.8, "loss": 0.6139572262763977, "grad_norm": 1.2131112813949585, "learning_rate": 5.850760894674713e-05} +{"ts": "2025-12-28T01:46:05", "event": "train_log", "step": 6970, "epoch": 2.9409282700421944, "progress_pct": 49.02, "epoch_pct": 49.02, "eta": "32:53:25", "max_grad_norm": 0.8, "loss": 0.6654361486434937, "grad_norm": 1.1740806102752686, "learning_rate": 5.8483943531942154e-05} +{"ts": "2025-12-28T01:46:18", "event": "train_log", "step": 6972, "epoch": 2.9417721518987343, "progress_pct": 49.03, "epoch_pct": 49.03, "eta": "32:52:32", "max_grad_norm": 0.8, "loss": 0.5477408766746521, "grad_norm": 1.1364716291427612, "learning_rate": 5.846027616005433e-05} +{"ts": "2025-12-28T01:46:30", "event": "train_log", "step": 6974, "epoch": 2.9426160337552743, "progress_pct": 49.04, "epoch_pct": 49.04, "eta": "32:51:38", "max_grad_norm": 0.8, "loss": 0.6023505926132202, "grad_norm": 1.212761640548706, "learning_rate": 5.843660683654328e-05} +{"ts": "2025-12-28T01:46:43", "event": "train_log", "step": 6976, "epoch": 2.943459915611814, "progress_pct": 49.06, "epoch_pct": 49.06, "eta": "32:50:44", "max_grad_norm": 0.8, "loss": 0.5926207304000854, "grad_norm": 1.1042946577072144, "learning_rate": 5.8412935566869075e-05} +{"ts": "2025-12-28T01:46:56", "event": "train_log", "step": 6978, "epoch": 2.9443037974683546, "progress_pct": 49.07, "epoch_pct": 49.07, "eta": "32:49:51", "max_grad_norm": 0.8, "loss": 0.5590356588363647, "grad_norm": 1.2444789409637451, "learning_rate": 5.83892623564922e-05} +{"ts": "2025-12-28T01:47:08", "event": "train_log", "step": 6980, "epoch": 2.9451476793248945, "progress_pct": 49.09, "epoch_pct": 49.09, "eta": "32:48:57", "max_grad_norm": 0.8, "loss": 0.553716778755188, "grad_norm": 1.0782465934753418, "learning_rate": 5.8365587210873616e-05} +{"ts": "2025-12-28T01:47:20", "event": "train_log", "step": 6982, "epoch": 2.9459915611814345, "progress_pct": 49.1, "epoch_pct": 49.1, "eta": "32:48:04", "max_grad_norm": 0.8, "loss": 0.5937044024467468, "grad_norm": 1.1914669275283813, "learning_rate": 5.834191013547473e-05} +{"ts": "2025-12-28T01:47:33", "event": "train_log", "step": 6984, "epoch": 2.946835443037975, "progress_pct": 49.11, "epoch_pct": 49.11, "eta": "32:47:11", "max_grad_norm": 0.8, "loss": 0.6439019441604614, "grad_norm": 1.1819682121276855, "learning_rate": 5.83182311357574e-05} +{"ts": "2025-12-28T01:47:46", "event": "train_log", "step": 6986, "epoch": 2.947679324894515, "progress_pct": 49.13, "epoch_pct": 49.13, "eta": "32:46:18", "max_grad_norm": 0.8, "loss": 0.5403141379356384, "grad_norm": 1.1807081699371338, "learning_rate": 5.829455021718389e-05} +{"ts": "2025-12-28T01:47:58", "event": "train_log", "step": 6988, "epoch": 2.9485232067510547, "progress_pct": 49.14, "epoch_pct": 49.14, "eta": "32:45:24", "max_grad_norm": 0.8, "loss": 0.5281378626823425, "grad_norm": 1.2721227407455444, "learning_rate": 5.827086738521692e-05} +{"ts": "2025-12-28T01:48:11", "event": "train_log", "step": 6990, "epoch": 2.9493670886075947, "progress_pct": 49.16, "epoch_pct": 49.16, "eta": "32:44:30", "max_grad_norm": 0.8, "loss": 0.5722067952156067, "grad_norm": 1.6942147016525269, "learning_rate": 5.824718264531972e-05} +{"ts": "2025-12-28T01:48:23", "event": "train_log", "step": 6992, "epoch": 2.950210970464135, "progress_pct": 49.17, "epoch_pct": 49.17, "eta": "32:43:37", "max_grad_norm": 0.8, "loss": 0.6228076815605164, "grad_norm": 1.3415225744247437, "learning_rate": 5.8223496002955865e-05} +{"ts": "2025-12-28T01:48:35", "event": "train_log", "step": 6994, "epoch": 2.951054852320675, "progress_pct": 49.18, "epoch_pct": 49.18, "eta": "32:42:43", "max_grad_norm": 0.8, "loss": 0.6019303202629089, "grad_norm": 1.235356092453003, "learning_rate": 5.819980746358941e-05} +{"ts": "2025-12-28T01:48:48", "event": "train_log", "step": 6996, "epoch": 2.951898734177215, "progress_pct": 49.2, "epoch_pct": 49.2, "eta": "32:41:51", "max_grad_norm": 0.8, "loss": 0.5699147582054138, "grad_norm": 1.2500600814819336, "learning_rate": 5.817611703268486e-05} +{"ts": "2025-12-28T01:49:02", "event": "train_log", "step": 6998, "epoch": 2.9527426160337553, "progress_pct": 49.21, "epoch_pct": 49.21, "eta": "32:40:58", "max_grad_norm": 0.8, "loss": 0.6304079294204712, "grad_norm": 1.1581830978393555, "learning_rate": 5.8152424715707145e-05} +{"ts": "2025-12-28T01:49:14", "event": "train_log", "step": 7000, "epoch": 2.9535864978902953, "progress_pct": 49.23, "epoch_pct": 49.23, "eta": "32:40:05", "max_grad_norm": 0.8, "loss": 0.5464767217636108, "grad_norm": 1.2924201488494873, "learning_rate": 5.812873051812161e-05} +{"ts": "2025-12-28T01:57:47", "event": "train_log", "step": 7000, "epoch": 2.9535864978902953, "progress_pct": 49.23, "epoch_pct": 49.23, "eta": "32:48:54", "max_grad_norm": 0.8, "eval_loss": 0.6706293225288391, "eval_runtime": 513.4396, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104} +{"ts": "2025-12-28T01:58:01", "event": "train_log", "step": 7002, "epoch": 2.954430379746835, "progress_pct": 49.24, "epoch_pct": 49.24, "eta": "32:48:01", "max_grad_norm": 0.8, "loss": 0.5139666795730591, "grad_norm": 1.2045931816101074, "learning_rate": 5.810503444539405e-05} +{"ts": "2025-12-28T01:58:13", "event": "train_log", "step": 7004, "epoch": 2.9552742616033756, "progress_pct": 49.25, "epoch_pct": 49.25, "eta": "32:47:08", "max_grad_norm": 0.8, "loss": 0.574500322341919, "grad_norm": 1.0592173337936401, "learning_rate": 5.8081336502990716e-05} +{"ts": "2025-12-28T01:58:26", "event": "train_log", "step": 7006, "epoch": 2.9561181434599155, "progress_pct": 49.27, "epoch_pct": 49.27, "eta": "32:46:14", "max_grad_norm": 0.8, "loss": 0.5784007906913757, "grad_norm": 1.003440499305725, "learning_rate": 5.805763669637825e-05} +{"ts": "2025-12-28T01:58:38", "event": "train_log", "step": 7008, "epoch": 2.9569620253164555, "progress_pct": 49.28, "epoch_pct": 49.28, "eta": "32:45:21", "max_grad_norm": 0.8, "loss": 0.5930284261703491, "grad_norm": 1.2018240690231323, "learning_rate": 5.8033935031023757e-05} +{"ts": "2025-12-28T01:58:50", "event": "train_log", "step": 7010, "epoch": 2.957805907172996, "progress_pct": 49.3, "epoch_pct": 49.3, "eta": "32:44:27", "max_grad_norm": 0.8, "loss": 0.641598105430603, "grad_norm": 1.4118605852127075, "learning_rate": 5.801023151239473e-05} +{"ts": "2025-12-28T01:59:03", "event": "train_log", "step": 7012, "epoch": 2.958649789029536, "progress_pct": 49.31, "epoch_pct": 49.31, "eta": "32:43:33", "max_grad_norm": 0.8, "loss": 0.5804623365402222, "grad_norm": 1.167186975479126, "learning_rate": 5.798652614595914e-05} +{"ts": "2025-12-28T01:59:15", "event": "train_log", "step": 7014, "epoch": 2.9594936708860757, "progress_pct": 49.32, "epoch_pct": 49.32, "eta": "32:42:40", "max_grad_norm": 0.8, "loss": 0.6128653883934021, "grad_norm": 1.1934285163879395, "learning_rate": 5.796281893718536e-05} +{"ts": "2025-12-28T01:59:27", "event": "train_log", "step": 7016, "epoch": 2.960337552742616, "progress_pct": 49.34, "epoch_pct": 49.34, "eta": "32:41:46", "max_grad_norm": 0.8, "loss": 0.5458035469055176, "grad_norm": 1.1616190671920776, "learning_rate": 5.7939109891542164e-05} +{"ts": "2025-12-28T01:59:40", "event": "train_log", "step": 7018, "epoch": 2.961181434599156, "progress_pct": 49.35, "epoch_pct": 49.35, "eta": "32:40:52", "max_grad_norm": 0.8, "loss": 0.5968486070632935, "grad_norm": 1.2685189247131348, "learning_rate": 5.7915399014498814e-05} +{"ts": "2025-12-28T01:59:52", "event": "train_log", "step": 7020, "epoch": 2.962025316455696, "progress_pct": 49.37, "epoch_pct": 49.37, "eta": "32:39:59", "max_grad_norm": 0.8, "loss": 0.633076548576355, "grad_norm": 1.2075960636138916, "learning_rate": 5.789168631152491e-05} +{"ts": "2025-12-28T02:00:04", "event": "train_log", "step": 7022, "epoch": 2.9628691983122364, "progress_pct": 49.38, "epoch_pct": 49.38, "eta": "32:39:06", "max_grad_norm": 0.8, "loss": 0.592155933380127, "grad_norm": 1.1098700761795044, "learning_rate": 5.786797178809055e-05} +{"ts": "2025-12-28T02:00:17", "event": "train_log", "step": 7024, "epoch": 2.9637130801687763, "progress_pct": 49.4, "epoch_pct": 49.4, "eta": "32:38:12", "max_grad_norm": 0.8, "loss": 0.5480605363845825, "grad_norm": 1.1458083391189575, "learning_rate": 5.78442554496662e-05} +{"ts": "2025-12-28T02:00:30", "event": "train_log", "step": 7026, "epoch": 2.9645569620253163, "progress_pct": 49.41, "epoch_pct": 49.41, "eta": "32:37:19", "max_grad_norm": 0.8, "loss": 0.5167773365974426, "grad_norm": 1.0702389478683472, "learning_rate": 5.7820537301722766e-05} +{"ts": "2025-12-28T02:00:43", "event": "train_log", "step": 7028, "epoch": 2.9654008438818567, "progress_pct": 49.42, "epoch_pct": 49.42, "eta": "32:36:27", "max_grad_norm": 0.8, "loss": 0.5489409565925598, "grad_norm": 1.200501799583435, "learning_rate": 5.779681734973157e-05} +{"ts": "2025-12-28T02:00:56", "event": "train_log", "step": 7030, "epoch": 2.9662447257383966, "progress_pct": 49.44, "epoch_pct": 49.44, "eta": "32:35:34", "max_grad_norm": 0.8, "loss": 0.6175599098205566, "grad_norm": 1.075738549232483, "learning_rate": 5.777309559916435e-05} +{"ts": "2025-12-28T02:01:08", "event": "train_log", "step": 7032, "epoch": 2.9670886075949365, "progress_pct": 49.45, "epoch_pct": 49.45, "eta": "32:34:41", "max_grad_norm": 0.8, "loss": 0.5721893310546875, "grad_norm": 1.2832911014556885, "learning_rate": 5.774937205549328e-05} +{"ts": "2025-12-28T02:01:20", "event": "train_log", "step": 7034, "epoch": 2.967932489451477, "progress_pct": 49.47, "epoch_pct": 49.47, "eta": "32:33:47", "max_grad_norm": 0.8, "loss": 0.7007027864456177, "grad_norm": 1.3263260126113892, "learning_rate": 5.7725646724190884e-05} +{"ts": "2025-12-28T02:01:32", "event": "train_log", "step": 7036, "epoch": 2.968776371308017, "progress_pct": 49.48, "epoch_pct": 49.48, "eta": "32:32:53", "max_grad_norm": 0.8, "loss": 0.5676232576370239, "grad_norm": 1.254817247390747, "learning_rate": 5.770191961073017e-05} +{"ts": "2025-12-28T02:01:45", "event": "train_log", "step": 7038, "epoch": 2.969620253164557, "progress_pct": 49.49, "epoch_pct": 49.49, "eta": "32:32:01", "max_grad_norm": 0.8, "loss": 0.5563743114471436, "grad_norm": 1.0725815296173096, "learning_rate": 5.767819072058453e-05} +{"ts": "2025-12-28T02:01:57", "event": "train_log", "step": 7040, "epoch": 2.970464135021097, "progress_pct": 49.51, "epoch_pct": 49.51, "eta": "32:31:07", "max_grad_norm": 0.8, "loss": 0.6449083089828491, "grad_norm": 1.2760009765625, "learning_rate": 5.765446005922774e-05} +{"ts": "2025-12-28T02:02:10", "event": "train_log", "step": 7042, "epoch": 2.971308016877637, "progress_pct": 49.52, "epoch_pct": 49.52, "eta": "32:30:14", "max_grad_norm": 0.8, "loss": 0.6155483722686768, "grad_norm": 1.2716739177703857, "learning_rate": 5.763072763213402e-05} +{"ts": "2025-12-28T02:02:22", "event": "train_log", "step": 7044, "epoch": 2.972151898734177, "progress_pct": 49.54, "epoch_pct": 49.54, "eta": "32:29:21", "max_grad_norm": 0.8, "loss": 0.5797539949417114, "grad_norm": 1.3112155199050903, "learning_rate": 5.7606993444778004e-05} +{"ts": "2025-12-28T02:02:35", "event": "train_log", "step": 7046, "epoch": 2.9729957805907175, "progress_pct": 49.55, "epoch_pct": 49.55, "eta": "32:28:28", "max_grad_norm": 0.8, "loss": 0.5277710556983948, "grad_norm": 1.069555401802063, "learning_rate": 5.758325750263468e-05} +{"ts": "2025-12-28T02:02:47", "event": "train_log", "step": 7048, "epoch": 2.9738396624472574, "progress_pct": 49.56, "epoch_pct": 49.56, "eta": "32:27:35", "max_grad_norm": 0.8, "loss": 0.5641140937805176, "grad_norm": 1.2229703664779663, "learning_rate": 5.755951981117949e-05} +{"ts": "2025-12-28T02:03:00", "event": "train_log", "step": 7050, "epoch": 2.9746835443037973, "progress_pct": 49.58, "epoch_pct": 49.58, "eta": "32:26:42", "max_grad_norm": 0.8, "loss": 0.5734127163887024, "grad_norm": 1.1228448152542114, "learning_rate": 5.753578037588827e-05} +{"ts": "2025-12-28T02:03:12", "event": "train_log", "step": 7052, "epoch": 2.9755274261603377, "progress_pct": 49.59, "epoch_pct": 49.59, "eta": "32:25:48", "max_grad_norm": 0.8, "loss": 0.5992875695228577, "grad_norm": 1.372084379196167, "learning_rate": 5.751203920223724e-05} +{"ts": "2025-12-28T02:03:25", "event": "train_log", "step": 7054, "epoch": 2.9763713080168777, "progress_pct": 49.61, "epoch_pct": 49.61, "eta": "32:24:56", "max_grad_norm": 0.8, "loss": 0.5811893343925476, "grad_norm": 1.232243537902832, "learning_rate": 5.7488296295703036e-05} +{"ts": "2025-12-28T02:03:38", "event": "train_log", "step": 7056, "epoch": 2.9772151898734176, "progress_pct": 49.62, "epoch_pct": 49.62, "eta": "32:24:03", "max_grad_norm": 0.8, "loss": 0.5502846240997314, "grad_norm": 1.1907097101211548, "learning_rate": 5.746455166176269e-05} +{"ts": "2025-12-28T02:03:51", "event": "train_log", "step": 7058, "epoch": 2.978059071729958, "progress_pct": 49.63, "epoch_pct": 49.63, "eta": "32:23:11", "max_grad_norm": 0.8, "loss": 0.5908812284469604, "grad_norm": 1.1842679977416992, "learning_rate": 5.7440805305893644e-05} +{"ts": "2025-12-28T02:04:03", "event": "train_log", "step": 7060, "epoch": 2.978902953586498, "progress_pct": 49.65, "epoch_pct": 49.65, "eta": "32:22:17", "max_grad_norm": 0.8, "loss": 0.5468931198120117, "grad_norm": 1.2167452573776245, "learning_rate": 5.741705723357371e-05} +{"ts": "2025-12-28T02:04:16", "event": "train_log", "step": 7062, "epoch": 2.979746835443038, "progress_pct": 49.66, "epoch_pct": 49.66, "eta": "32:21:25", "max_grad_norm": 0.8, "loss": 0.5421503782272339, "grad_norm": 1.2835358381271362, "learning_rate": 5.739330745028113e-05} +{"ts": "2025-12-28T02:04:28", "event": "train_log", "step": 7064, "epoch": 2.9805907172995783, "progress_pct": 49.68, "epoch_pct": 49.68, "eta": "32:20:32", "max_grad_norm": 0.8, "loss": 0.5574424266815186, "grad_norm": 1.230869174003601, "learning_rate": 5.736955596149449e-05} +{"ts": "2025-12-28T02:04:41", "event": "train_log", "step": 7066, "epoch": 2.981434599156118, "progress_pct": 49.69, "epoch_pct": 49.69, "eta": "32:19:40", "max_grad_norm": 0.8, "loss": 0.5349726676940918, "grad_norm": 1.1757540702819824, "learning_rate": 5.7345802772692844e-05} +{"ts": "2025-12-28T02:04:54", "event": "train_log", "step": 7068, "epoch": 2.982278481012658, "progress_pct": 49.7, "epoch_pct": 49.7, "eta": "32:18:47", "max_grad_norm": 0.8, "loss": 0.5739659667015076, "grad_norm": 1.2147842645645142, "learning_rate": 5.732204788935558e-05} +{"ts": "2025-12-28T02:05:06", "event": "train_log", "step": 7070, "epoch": 2.9831223628691985, "progress_pct": 49.72, "epoch_pct": 49.72, "eta": "32:17:54", "max_grad_norm": 0.8, "loss": 0.6001242995262146, "grad_norm": 1.1981799602508545, "learning_rate": 5.729829131696247e-05} +{"ts": "2025-12-28T02:05:19", "event": "train_log", "step": 7072, "epoch": 2.9839662447257385, "progress_pct": 49.73, "epoch_pct": 49.73, "eta": "32:17:02", "max_grad_norm": 0.8, "loss": 0.5373315811157227, "grad_norm": 1.0104349851608276, "learning_rate": 5.7274533060993744e-05} +{"ts": "2025-12-28T02:05:32", "event": "train_log", "step": 7074, "epoch": 2.9848101265822784, "progress_pct": 49.75, "epoch_pct": 49.75, "eta": "32:16:09", "max_grad_norm": 0.8, "loss": 0.6236737370491028, "grad_norm": 1.31861412525177, "learning_rate": 5.725077312692994e-05} +{"ts": "2025-12-28T02:05:45", "event": "train_log", "step": 7076, "epoch": 2.985654008438819, "progress_pct": 49.76, "epoch_pct": 49.76, "eta": "32:15:17", "max_grad_norm": 0.8, "loss": 0.5138278007507324, "grad_norm": 1.2060835361480713, "learning_rate": 5.722701152025203e-05} +{"ts": "2025-12-28T02:05:57", "event": "train_log", "step": 7078, "epoch": 2.9864978902953587, "progress_pct": 49.77, "epoch_pct": 49.77, "eta": "32:14:24", "max_grad_norm": 0.8, "loss": 0.5775829553604126, "grad_norm": 1.2231637239456177, "learning_rate": 5.720324824644134e-05} +{"ts": "2025-12-28T02:06:10", "event": "train_log", "step": 7080, "epoch": 2.9873417721518987, "progress_pct": 49.79, "epoch_pct": 49.79, "eta": "32:13:31", "max_grad_norm": 0.8, "loss": 0.5619624853134155, "grad_norm": 1.110559344291687, "learning_rate": 5.717948331097965e-05} +{"ts": "2025-12-28T02:06:23", "event": "train_log", "step": 7082, "epoch": 2.988185654008439, "progress_pct": 49.8, "epoch_pct": 49.8, "eta": "32:12:39", "max_grad_norm": 0.8, "loss": 0.5401903390884399, "grad_norm": 1.0486462116241455, "learning_rate": 5.715571671934903e-05} +{"ts": "2025-12-28T02:06:35", "event": "train_log", "step": 7084, "epoch": 2.989029535864979, "progress_pct": 49.82, "epoch_pct": 49.82, "eta": "32:11:47", "max_grad_norm": 0.8, "loss": 0.6185324192047119, "grad_norm": 1.7979792356491089, "learning_rate": 5.713194847703201e-05} +{"ts": "2025-12-28T02:06:48", "event": "train_log", "step": 7086, "epoch": 2.989873417721519, "progress_pct": 49.83, "epoch_pct": 49.83, "eta": "32:10:55", "max_grad_norm": 0.8, "loss": 0.5637381672859192, "grad_norm": 1.1270287036895752, "learning_rate": 5.710817858951143e-05} +{"ts": "2025-12-28T02:07:01", "event": "train_log", "step": 7088, "epoch": 2.9907172995780593, "progress_pct": 49.85, "epoch_pct": 49.85, "eta": "32:10:03", "max_grad_norm": 0.8, "loss": 0.5341202020645142, "grad_norm": 1.0734593868255615, "learning_rate": 5.708440706227055e-05} +{"ts": "2025-12-28T02:07:13", "event": "train_log", "step": 7090, "epoch": 2.9915611814345993, "progress_pct": 49.86, "epoch_pct": 49.86, "eta": "32:09:10", "max_grad_norm": 0.8, "loss": 0.6088040471076965, "grad_norm": 1.1479569673538208, "learning_rate": 5.7060633900793035e-05} +{"ts": "2025-12-28T02:07:25", "event": "train_log", "step": 7092, "epoch": 2.992405063291139, "progress_pct": 49.87, "epoch_pct": 49.87, "eta": "32:08:16", "max_grad_norm": 0.8, "loss": 0.6260532736778259, "grad_norm": 1.417993426322937, "learning_rate": 5.703685911056288e-05} +{"ts": "2025-12-28T02:07:38", "event": "train_log", "step": 7094, "epoch": 2.9932489451476796, "progress_pct": 49.89, "epoch_pct": 49.89, "eta": "32:07:24", "max_grad_norm": 0.8, "loss": 0.5241007804870605, "grad_norm": 1.0302354097366333, "learning_rate": 5.701308269706449e-05} +{"ts": "2025-12-28T02:07:51", "event": "train_log", "step": 7096, "epoch": 2.9940928270042195, "progress_pct": 49.9, "epoch_pct": 49.9, "eta": "32:06:32", "max_grad_norm": 0.8, "loss": 0.5663899183273315, "grad_norm": 1.0818110704421997, "learning_rate": 5.6989304665782585e-05} +{"ts": "2025-12-28T02:08:03", "event": "train_log", "step": 7098, "epoch": 2.9949367088607595, "progress_pct": 49.92, "epoch_pct": 49.92, "eta": "32:05:39", "max_grad_norm": 0.8, "loss": 0.6420456171035767, "grad_norm": 1.3382261991500854, "learning_rate": 5.696552502220235e-05} +{"ts": "2025-12-28T02:08:16", "event": "train_log", "step": 7100, "epoch": 2.9957805907173, "progress_pct": 49.93, "epoch_pct": 49.93, "eta": "32:04:47", "max_grad_norm": 0.8, "loss": 0.6239140033721924, "grad_norm": 1.0404452085494995, "learning_rate": 5.6941743771809254e-05} +{"ts": "2025-12-28T02:16:49", "event": "train_log", "step": 7100, "epoch": 2.9957805907173, "progress_pct": 49.93, "epoch_pct": 49.93, "eta": "32:13:22", "max_grad_norm": 0.8, "eval_loss": 0.6692973375320435, "eval_runtime": 512.8985, "eval_samples_per_second": 4.108, "eval_steps_per_second": 4.108} +{"ts": "2025-12-28T02:17:02", "event": "train_log", "step": 7102, "epoch": 2.99662447257384, "progress_pct": 49.94, "epoch_pct": 49.94, "eta": "32:12:30", "max_grad_norm": 0.8, "loss": 0.5956323146820068, "grad_norm": 1.0349514484405518, "learning_rate": 5.691796092008918e-05} +{"ts": "2025-12-28T02:17:15", "event": "train_log", "step": 7104, "epoch": 2.9974683544303797, "progress_pct": 49.96, "epoch_pct": 49.96, "eta": "32:11:37", "max_grad_norm": 0.8, "loss": 0.5639365911483765, "grad_norm": 1.0786800384521484, "learning_rate": 5.689417647252839e-05} +{"ts": "2025-12-28T02:17:27", "event": "train_log", "step": 7106, "epoch": 2.9983122362869197, "progress_pct": 49.97, "epoch_pct": 49.97, "eta": "32:10:45", "max_grad_norm": 0.8, "loss": 0.5529769659042358, "grad_norm": 1.2075775861740112, "learning_rate": 5.687039043461351e-05} +{"ts": "2025-12-28T02:17:39", "event": "train_log", "step": 7108, "epoch": 2.99915611814346, "progress_pct": 49.99, "epoch_pct": 49.99, "eta": "32:09:51", "max_grad_norm": 0.8, "loss": 0.5834671258926392, "grad_norm": 1.2835887670516968, "learning_rate": 5.6846602811831496e-05} +{"ts": "2025-12-28T02:17:52", "event": "train_log", "step": 7110, "epoch": 3.0, "progress_pct": 50.0, "epoch_pct": 50.0, "eta": "32:08:59", "max_grad_norm": 0.8, "loss": 0.5820922255516052, "grad_norm": 1.3102463483810425, "learning_rate": 5.682281360966969e-05} +{"ts": "2025-12-28T02:18:04", "event": "train_log", "step": 7112, "epoch": 3.00084388185654, "progress_pct": 50.01, "epoch_pct": 50.01, "eta": "32:08:06", "max_grad_norm": 0.8, "loss": 0.5958086252212524, "grad_norm": 1.24532949924469, "learning_rate": 5.679902283361582e-05} +{"ts": "2025-12-28T02:18:17", "event": "train_log", "step": 7114, "epoch": 3.0016877637130803, "progress_pct": 50.03, "epoch_pct": 50.03, "eta": "32:07:14", "max_grad_norm": 0.8, "loss": 0.5267294645309448, "grad_norm": 1.0468344688415527, "learning_rate": 5.677523048915798e-05} +{"ts": "2025-12-28T02:18:30", "event": "train_log", "step": 7116, "epoch": 3.0025316455696203, "progress_pct": 50.04, "epoch_pct": 50.04, "eta": "32:06:22", "max_grad_norm": 0.8, "loss": 0.49180498719215393, "grad_norm": 1.2053340673446655, "learning_rate": 5.675143658178458e-05} +{"ts": "2025-12-28T02:18:43", "event": "train_log", "step": 7118, "epoch": 3.00337552742616, "progress_pct": 50.06, "epoch_pct": 50.06, "eta": "32:05:30", "max_grad_norm": 0.8, "loss": 0.6163156032562256, "grad_norm": 1.1861987113952637, "learning_rate": 5.6727641116984406e-05} +{"ts": "2025-12-28T02:18:56", "event": "train_log", "step": 7120, "epoch": 3.0042194092827006, "progress_pct": 50.07, "epoch_pct": 50.07, "eta": "32:04:38", "max_grad_norm": 0.8, "loss": 0.4780079424381256, "grad_norm": 0.9804314374923706, "learning_rate": 5.670384410024665e-05} +{"ts": "2025-12-28T02:19:09", "event": "train_log", "step": 7122, "epoch": 3.0050632911392405, "progress_pct": 50.08, "epoch_pct": 50.08, "eta": "32:03:46", "max_grad_norm": 0.8, "loss": 0.4762009382247925, "grad_norm": 1.148734450340271, "learning_rate": 5.668004553706081e-05} +{"ts": "2025-12-28T02:19:22", "event": "train_log", "step": 7124, "epoch": 3.0059071729957805, "progress_pct": 50.1, "epoch_pct": 50.1, "eta": "32:02:54", "max_grad_norm": 0.8, "loss": 0.5391061305999756, "grad_norm": 1.3817394971847534, "learning_rate": 5.665624543291677e-05} +{"ts": "2025-12-28T02:19:34", "event": "train_log", "step": 7126, "epoch": 3.006751054852321, "progress_pct": 50.11, "epoch_pct": 50.11, "eta": "32:02:01", "max_grad_norm": 0.8, "loss": 0.5118980407714844, "grad_norm": 1.2641339302062988, "learning_rate": 5.663244379330471e-05} +{"ts": "2025-12-28T02:19:47", "event": "train_log", "step": 7128, "epoch": 3.007594936708861, "progress_pct": 50.13, "epoch_pct": 50.13, "eta": "32:01:09", "max_grad_norm": 0.8, "loss": 0.5076818466186523, "grad_norm": 1.1882877349853516, "learning_rate": 5.660864062371527e-05} +{"ts": "2025-12-28T02:19:59", "event": "train_log", "step": 7130, "epoch": 3.0084388185654007, "progress_pct": 50.14, "epoch_pct": 50.14, "eta": "32:00:16", "max_grad_norm": 0.8, "loss": 0.5128282308578491, "grad_norm": 1.3996630907058716, "learning_rate": 5.658483592963936e-05} +{"ts": "2025-12-28T02:20:11", "event": "train_log", "step": 7132, "epoch": 3.009282700421941, "progress_pct": 50.15, "epoch_pct": 50.15, "eta": "31:59:23", "max_grad_norm": 0.8, "loss": 0.5689603090286255, "grad_norm": 1.4738327264785767, "learning_rate": 5.6561029716568246e-05} +{"ts": "2025-12-28T02:20:24", "event": "train_log", "step": 7134, "epoch": 3.010126582278481, "progress_pct": 50.17, "epoch_pct": 50.17, "eta": "31:58:32", "max_grad_norm": 0.8, "loss": 0.537216067314148, "grad_norm": 1.2539118528366089, "learning_rate": 5.6537221989993605e-05} +{"ts": "2025-12-28T02:20:37", "event": "train_log", "step": 7136, "epoch": 3.010970464135021, "progress_pct": 50.18, "epoch_pct": 50.18, "eta": "31:57:39", "max_grad_norm": 0.8, "loss": 0.5913172960281372, "grad_norm": 1.2467267513275146, "learning_rate": 5.6513412755407394e-05} +{"ts": "2025-12-28T02:20:49", "event": "train_log", "step": 7138, "epoch": 3.0118143459915614, "progress_pct": 50.2, "epoch_pct": 50.2, "eta": "31:56:46", "max_grad_norm": 0.8, "loss": 0.535701334476471, "grad_norm": 1.232380986213684, "learning_rate": 5.648960201830194e-05} +{"ts": "2025-12-28T02:21:01", "event": "train_log", "step": 7140, "epoch": 3.0126582278481013, "progress_pct": 50.21, "epoch_pct": 50.21, "eta": "31:55:53", "max_grad_norm": 0.8, "loss": 0.5035087466239929, "grad_norm": 1.2236435413360596, "learning_rate": 5.6465789784169944e-05} +{"ts": "2025-12-28T02:21:14", "event": "train_log", "step": 7142, "epoch": 3.0135021097046413, "progress_pct": 50.23, "epoch_pct": 50.23, "eta": "31:55:02", "max_grad_norm": 0.8, "loss": 0.5219660401344299, "grad_norm": 1.1154464483261108, "learning_rate": 5.6441976058504444e-05} +{"ts": "2025-12-28T02:21:27", "event": "train_log", "step": 7144, "epoch": 3.014345991561181, "progress_pct": 50.24, "epoch_pct": 50.24, "eta": "31:54:10", "max_grad_norm": 0.8, "loss": 0.5170891880989075, "grad_norm": 1.1690709590911865, "learning_rate": 5.6418160846798765e-05} +{"ts": "2025-12-28T02:21:40", "event": "train_log", "step": 7146, "epoch": 3.0151898734177216, "progress_pct": 50.25, "epoch_pct": 50.25, "eta": "31:53:19", "max_grad_norm": 0.8, "loss": 0.52115398645401, "grad_norm": 1.3172271251678467, "learning_rate": 5.639434415454663e-05} +{"ts": "2025-12-28T02:21:53", "event": "train_log", "step": 7148, "epoch": 3.0160337552742615, "progress_pct": 50.27, "epoch_pct": 50.27, "eta": "31:52:27", "max_grad_norm": 0.8, "loss": 0.49015527963638306, "grad_norm": 1.1508091688156128, "learning_rate": 5.637052598724213e-05} +{"ts": "2025-12-28T02:22:06", "event": "train_log", "step": 7150, "epoch": 3.0168776371308015, "progress_pct": 50.28, "epoch_pct": 50.28, "eta": "31:51:35", "max_grad_norm": 0.8, "loss": 0.5465641021728516, "grad_norm": 1.1777493953704834, "learning_rate": 5.634670635037962e-05} +{"ts": "2025-12-28T02:22:19", "event": "train_log", "step": 7152, "epoch": 3.017721518987342, "progress_pct": 50.3, "epoch_pct": 50.3, "eta": "31:50:43", "max_grad_norm": 0.8, "loss": 0.5174515843391418, "grad_norm": 1.2320231199264526, "learning_rate": 5.632288524945385e-05} +{"ts": "2025-12-28T02:22:31", "event": "train_log", "step": 7154, "epoch": 3.018565400843882, "progress_pct": 50.31, "epoch_pct": 50.31, "eta": "31:49:51", "max_grad_norm": 0.8, "loss": 0.521284818649292, "grad_norm": 1.3233075141906738, "learning_rate": 5.629906268995988e-05} +{"ts": "2025-12-28T02:22:44", "event": "train_log", "step": 7156, "epoch": 3.0194092827004217, "progress_pct": 50.32, "epoch_pct": 50.32, "eta": "31:48:59", "max_grad_norm": 0.8, "loss": 0.4841000437736511, "grad_norm": 1.1378387212753296, "learning_rate": 5.6275238677393136e-05} +{"ts": "2025-12-28T02:22:55", "event": "train_log", "step": 7158, "epoch": 3.020253164556962, "progress_pct": 50.34, "epoch_pct": 50.34, "eta": "31:48:06", "max_grad_norm": 0.8, "loss": 0.5399911403656006, "grad_norm": 1.4944018125534058, "learning_rate": 5.6251413217249325e-05} +{"ts": "2025-12-28T02:23:07", "event": "train_log", "step": 7160, "epoch": 3.021097046413502, "progress_pct": 50.35, "epoch_pct": 50.35, "eta": "31:47:13", "max_grad_norm": 0.8, "loss": 0.6075693368911743, "grad_norm": 1.3964036703109741, "learning_rate": 5.622758631502457e-05} +{"ts": "2025-12-28T02:23:21", "event": "train_log", "step": 7162, "epoch": 3.021940928270042, "progress_pct": 50.37, "epoch_pct": 50.37, "eta": "31:46:22", "max_grad_norm": 0.8, "loss": 0.4700590968132019, "grad_norm": 1.2494895458221436, "learning_rate": 5.6203757976215244e-05} +{"ts": "2025-12-28T02:23:34", "event": "train_log", "step": 7164, "epoch": 3.0227848101265824, "progress_pct": 50.38, "epoch_pct": 50.38, "eta": "31:45:31", "max_grad_norm": 0.8, "loss": 0.46371224522590637, "grad_norm": 1.2082068920135498, "learning_rate": 5.617992820631809e-05} +{"ts": "2025-12-28T02:23:46", "event": "train_log", "step": 7166, "epoch": 3.0236286919831223, "progress_pct": 50.39, "epoch_pct": 50.39, "eta": "31:44:38", "max_grad_norm": 0.8, "loss": 0.6175356507301331, "grad_norm": 1.2820552587509155, "learning_rate": 5.61560970108302e-05} +{"ts": "2025-12-28T02:23:57", "event": "train_log", "step": 7168, "epoch": 3.0244725738396623, "progress_pct": 50.41, "epoch_pct": 50.41, "eta": "31:43:45", "max_grad_norm": 0.8, "loss": 0.5443550944328308, "grad_norm": 1.243906855583191, "learning_rate": 5.613226439524896e-05} +{"ts": "2025-12-28T02:24:10", "event": "train_log", "step": 7170, "epoch": 3.0253164556962027, "progress_pct": 50.42, "epoch_pct": 50.42, "eta": "31:42:53", "max_grad_norm": 0.8, "loss": 0.540513277053833, "grad_norm": 1.2818046808242798, "learning_rate": 5.6108430365072097e-05} +{"ts": "2025-12-28T02:24:22", "event": "train_log", "step": 7172, "epoch": 3.0261603375527426, "progress_pct": 50.44, "epoch_pct": 50.44, "eta": "31:42:01", "max_grad_norm": 0.8, "loss": 0.47928962111473083, "grad_norm": 1.2159545421600342, "learning_rate": 5.608459492579765e-05} +{"ts": "2025-12-28T02:24:35", "event": "train_log", "step": 7174, "epoch": 3.0270042194092825, "progress_pct": 50.45, "epoch_pct": 50.45, "eta": "31:41:09", "max_grad_norm": 0.8, "loss": 0.572704553604126, "grad_norm": 1.2186859846115112, "learning_rate": 5.606075808292401e-05} +{"ts": "2025-12-28T02:24:48", "event": "train_log", "step": 7176, "epoch": 3.027848101265823, "progress_pct": 50.46, "epoch_pct": 50.46, "eta": "31:40:18", "max_grad_norm": 0.8, "loss": 0.5537641048431396, "grad_norm": 1.0899910926818848, "learning_rate": 5.60369198419499e-05} +{"ts": "2025-12-28T02:25:00", "event": "train_log", "step": 7178, "epoch": 3.028691983122363, "progress_pct": 50.48, "epoch_pct": 50.48, "eta": "31:39:26", "max_grad_norm": 0.8, "loss": 0.5430077910423279, "grad_norm": 1.1885626316070557, "learning_rate": 5.601308020837431e-05} +{"ts": "2025-12-28T02:25:13", "event": "train_log", "step": 7180, "epoch": 3.029535864978903, "progress_pct": 50.49, "epoch_pct": 50.49, "eta": "31:38:34", "max_grad_norm": 0.8, "loss": 0.5838874578475952, "grad_norm": 1.3681434392929077, "learning_rate": 5.5989239187696595e-05} +{"ts": "2025-12-28T02:25:25", "event": "train_log", "step": 7182, "epoch": 3.030379746835443, "progress_pct": 50.51, "epoch_pct": 50.51, "eta": "31:37:42", "max_grad_norm": 0.8, "loss": 0.5168817639350891, "grad_norm": 1.4902375936508179, "learning_rate": 5.596539678541644e-05} +{"ts": "2025-12-28T02:25:36", "event": "train_log", "step": 7184, "epoch": 3.031223628691983, "progress_pct": 50.52, "epoch_pct": 50.52, "eta": "31:36:50", "max_grad_norm": 0.8, "loss": 0.5464931726455688, "grad_norm": 1.4395933151245117, "learning_rate": 5.59415530070338e-05} +{"ts": "2025-12-28T02:25:49", "event": "train_log", "step": 7186, "epoch": 3.032067510548523, "progress_pct": 50.53, "epoch_pct": 50.53, "eta": "31:35:58", "max_grad_norm": 0.8, "loss": 0.5364856123924255, "grad_norm": 1.2699668407440186, "learning_rate": 5.5917707858049e-05} +{"ts": "2025-12-28T02:26:01", "event": "train_log", "step": 7188, "epoch": 3.0329113924050635, "progress_pct": 50.55, "epoch_pct": 50.55, "eta": "31:35:06", "max_grad_norm": 0.8, "loss": 0.5676021575927734, "grad_norm": 1.1673169136047363, "learning_rate": 5.589386134396264e-05} +{"ts": "2025-12-28T02:26:14", "event": "train_log", "step": 7190, "epoch": 3.0337552742616034, "progress_pct": 50.56, "epoch_pct": 50.56, "eta": "31:34:14", "max_grad_norm": 0.8, "loss": 0.5174224972724915, "grad_norm": 1.2029050588607788, "learning_rate": 5.5870013470275675e-05} +{"ts": "2025-12-28T02:26:26", "event": "train_log", "step": 7192, "epoch": 3.0345991561181433, "progress_pct": 50.58, "epoch_pct": 50.58, "eta": "31:33:22", "max_grad_norm": 0.8, "loss": 0.5298268795013428, "grad_norm": 1.2046477794647217, "learning_rate": 5.5846164242489326e-05} +{"ts": "2025-12-28T02:26:39", "event": "train_log", "step": 7194, "epoch": 3.0354430379746837, "progress_pct": 50.59, "epoch_pct": 50.59, "eta": "31:32:31", "max_grad_norm": 0.8, "loss": 0.5120787024497986, "grad_norm": 1.2438830137252808, "learning_rate": 5.582231366610516e-05} +{"ts": "2025-12-28T02:26:52", "event": "train_log", "step": 7196, "epoch": 3.0362869198312237, "progress_pct": 50.6, "epoch_pct": 50.6, "eta": "31:31:39", "max_grad_norm": 0.8, "loss": 0.4706324338912964, "grad_norm": 1.1918164491653442, "learning_rate": 5.579846174662506e-05} +{"ts": "2025-12-28T02:27:04", "event": "train_log", "step": 7198, "epoch": 3.0371308016877636, "progress_pct": 50.62, "epoch_pct": 50.62, "eta": "31:30:48", "max_grad_norm": 0.8, "loss": 0.5319511294364929, "grad_norm": 1.125056266784668, "learning_rate": 5.57746084895512e-05} +{"ts": "2025-12-28T02:27:16", "event": "train_log", "step": 7200, "epoch": 3.037974683544304, "progress_pct": 50.63, "epoch_pct": 50.63, "eta": "31:29:55", "max_grad_norm": 0.8, "loss": 0.5893887877464294, "grad_norm": 1.3552099466323853, "learning_rate": 5.575075390038607e-05} +{"ts": "2025-12-28T02:35:50", "event": "train_log", "step": 7200, "epoch": 3.037974683544304, "progress_pct": 50.63, "epoch_pct": 50.63, "eta": "31:38:16", "max_grad_norm": 0.8, "eval_loss": 0.6751418709754944, "eval_runtime": 513.8972, "eval_samples_per_second": 4.1, "eval_steps_per_second": 4.1} +{"ts": "2025-12-28T02:36:02", "event": "train_log", "step": 7202, "epoch": 3.038818565400844, "progress_pct": 50.65, "epoch_pct": 50.65, "eta": "31:37:24", "max_grad_norm": 0.8, "loss": 0.5680004358291626, "grad_norm": 1.3924046754837036, "learning_rate": 5.572689798463243e-05} +{"ts": "2025-12-28T02:36:15", "event": "train_log", "step": 7204, "epoch": 3.039662447257384, "progress_pct": 50.66, "epoch_pct": 50.66, "eta": "31:36:33", "max_grad_norm": 0.8, "loss": 0.5572541356086731, "grad_norm": 1.3154771327972412, "learning_rate": 5.5703040747793444e-05} +{"ts": "2025-12-28T02:36:28", "event": "train_log", "step": 7206, "epoch": 3.0405063291139243, "progress_pct": 50.68, "epoch_pct": 50.68, "eta": "31:35:41", "max_grad_norm": 0.8, "loss": 0.535094141960144, "grad_norm": 1.2266511917114258, "learning_rate": 5.567918219537247e-05} +{"ts": "2025-12-28T02:36:40", "event": "train_log", "step": 7208, "epoch": 3.041350210970464, "progress_pct": 50.69, "epoch_pct": 50.69, "eta": "31:34:50", "max_grad_norm": 0.8, "loss": 0.5958529710769653, "grad_norm": 1.2234530448913574, "learning_rate": 5.565532233287324e-05} +{"ts": "2025-12-28T02:36:53", "event": "train_log", "step": 7210, "epoch": 3.042194092827004, "progress_pct": 50.7, "epoch_pct": 50.7, "eta": "31:33:58", "max_grad_norm": 0.8, "loss": 0.5555807948112488, "grad_norm": 1.2451010942459106, "learning_rate": 5.563146116579977e-05} +{"ts": "2025-12-28T02:37:05", "event": "train_log", "step": 7212, "epoch": 3.043037974683544, "progress_pct": 50.72, "epoch_pct": 50.72, "eta": "31:33:05", "max_grad_norm": 0.8, "loss": 0.5391029715538025, "grad_norm": 1.518996000289917, "learning_rate": 5.560759869965635e-05} +{"ts": "2025-12-28T02:37:17", "event": "train_log", "step": 7214, "epoch": 3.0438818565400845, "progress_pct": 50.73, "epoch_pct": 50.73, "eta": "31:32:13", "max_grad_norm": 0.8, "loss": 0.6110212802886963, "grad_norm": 1.4555507898330688, "learning_rate": 5.5583734939947604e-05} +{"ts": "2025-12-28T02:37:30", "event": "train_log", "step": 7216, "epoch": 3.0447257383966244, "progress_pct": 50.75, "epoch_pct": 50.75, "eta": "31:31:22", "max_grad_norm": 0.8, "loss": 0.4841096103191376, "grad_norm": 1.1732209920883179, "learning_rate": 5.555986989217844e-05} +{"ts": "2025-12-28T02:37:42", "event": "train_log", "step": 7218, "epoch": 3.0455696202531644, "progress_pct": 50.76, "epoch_pct": 50.76, "eta": "31:30:30", "max_grad_norm": 0.8, "loss": 0.5234199166297913, "grad_norm": 1.3211549520492554, "learning_rate": 5.55360035618541e-05} +{"ts": "2025-12-28T02:37:55", "event": "train_log", "step": 7220, "epoch": 3.0464135021097047, "progress_pct": 50.77, "epoch_pct": 50.77, "eta": "31:29:39", "max_grad_norm": 0.8, "loss": 0.5311322808265686, "grad_norm": 1.0290759801864624, "learning_rate": 5.551213595448003e-05} +{"ts": "2025-12-28T02:38:08", "event": "train_log", "step": 7222, "epoch": 3.0472573839662447, "progress_pct": 50.79, "epoch_pct": 50.79, "eta": "31:28:48", "max_grad_norm": 0.8, "loss": 0.5279681086540222, "grad_norm": 1.3045908212661743, "learning_rate": 5.548826707556206e-05} +{"ts": "2025-12-28T02:38:21", "event": "train_log", "step": 7224, "epoch": 3.0481012658227846, "progress_pct": 50.8, "epoch_pct": 50.8, "eta": "31:27:56", "max_grad_norm": 0.8, "loss": 0.47327345609664917, "grad_norm": 1.039219617843628, "learning_rate": 5.54643969306063e-05} +{"ts": "2025-12-28T02:38:33", "event": "train_log", "step": 7226, "epoch": 3.048945147679325, "progress_pct": 50.82, "epoch_pct": 50.82, "eta": "31:27:04", "max_grad_norm": 0.8, "loss": 0.5803293585777283, "grad_norm": 1.5341938734054565, "learning_rate": 5.544052552511909e-05} +{"ts": "2025-12-28T02:38:46", "event": "train_log", "step": 7228, "epoch": 3.049789029535865, "progress_pct": 50.83, "epoch_pct": 50.83, "eta": "31:26:13", "max_grad_norm": 0.8, "loss": 0.5452714562416077, "grad_norm": 1.24624502658844, "learning_rate": 5.5416652864607156e-05} +{"ts": "2025-12-28T02:38:58", "event": "train_log", "step": 7230, "epoch": 3.050632911392405, "progress_pct": 50.84, "epoch_pct": 50.84, "eta": "31:25:21", "max_grad_norm": 0.8, "loss": 0.48333147168159485, "grad_norm": 1.192566156387329, "learning_rate": 5.5392778954577416e-05} +{"ts": "2025-12-28T02:39:11", "event": "train_log", "step": 7232, "epoch": 3.0514767932489453, "progress_pct": 50.86, "epoch_pct": 50.86, "eta": "31:24:30", "max_grad_norm": 0.8, "loss": 0.4947234094142914, "grad_norm": 1.3091192245483398, "learning_rate": 5.536890380053715e-05} +{"ts": "2025-12-28T02:39:24", "event": "train_log", "step": 7234, "epoch": 3.052320675105485, "progress_pct": 50.87, "epoch_pct": 50.87, "eta": "31:23:39", "max_grad_norm": 0.8, "loss": 0.5226179361343384, "grad_norm": 1.171740174293518, "learning_rate": 5.534502740799388e-05} +{"ts": "2025-12-28T02:39:38", "event": "train_log", "step": 7236, "epoch": 3.053164556962025, "progress_pct": 50.89, "epoch_pct": 50.89, "eta": "31:22:49", "max_grad_norm": 0.8, "loss": 0.490182101726532, "grad_norm": 1.1677600145339966, "learning_rate": 5.532114978245544e-05} +{"ts": "2025-12-28T02:39:50", "event": "train_log", "step": 7238, "epoch": 3.0540084388185655, "progress_pct": 50.9, "epoch_pct": 50.9, "eta": "31:21:57", "max_grad_norm": 0.8, "loss": 0.5542705655097961, "grad_norm": 1.2062798738479614, "learning_rate": 5.529727092942994e-05} +{"ts": "2025-12-28T02:40:02", "event": "train_log", "step": 7240, "epoch": 3.0548523206751055, "progress_pct": 50.91, "epoch_pct": 50.91, "eta": "31:21:05", "max_grad_norm": 0.8, "loss": 0.5947107076644897, "grad_norm": 1.2385777235031128, "learning_rate": 5.5273390854425774e-05} +{"ts": "2025-12-28T02:40:15", "event": "train_log", "step": 7242, "epoch": 3.0556962025316454, "progress_pct": 50.93, "epoch_pct": 50.93, "eta": "31:20:13", "max_grad_norm": 0.8, "loss": 0.5728942155838013, "grad_norm": 1.39088773727417, "learning_rate": 5.524950956295162e-05} +{"ts": "2025-12-28T02:40:27", "event": "train_log", "step": 7244, "epoch": 3.056540084388186, "progress_pct": 50.94, "epoch_pct": 50.94, "eta": "31:19:22", "max_grad_norm": 0.8, "loss": 0.5572934150695801, "grad_norm": 1.3944119215011597, "learning_rate": 5.522562706051643e-05} +{"ts": "2025-12-28T02:40:39", "event": "train_log", "step": 7246, "epoch": 3.0573839662447257, "progress_pct": 50.96, "epoch_pct": 50.96, "eta": "31:18:30", "max_grad_norm": 0.8, "loss": 0.6066661477088928, "grad_norm": 1.3647639751434326, "learning_rate": 5.520174335262944e-05} +{"ts": "2025-12-28T02:40:50", "event": "train_log", "step": 7248, "epoch": 3.0582278481012657, "progress_pct": 50.97, "epoch_pct": 50.97, "eta": "31:17:38", "max_grad_norm": 0.8, "loss": 0.5683084726333618, "grad_norm": 1.604581356048584, "learning_rate": 5.5177858444800146e-05} +{"ts": "2025-12-28T02:41:02", "event": "train_log", "step": 7250, "epoch": 3.059071729957806, "progress_pct": 50.98, "epoch_pct": 50.98, "eta": "31:16:46", "max_grad_norm": 0.8, "loss": 0.5676847100257874, "grad_norm": 1.280266284942627, "learning_rate": 5.515397234253836e-05} +{"ts": "2025-12-28T02:41:15", "event": "train_log", "step": 7252, "epoch": 3.059915611814346, "progress_pct": 51.0, "epoch_pct": 51.0, "eta": "31:15:54", "max_grad_norm": 0.8, "loss": 0.48119837045669556, "grad_norm": 1.1750892400741577, "learning_rate": 5.513008505135414e-05} +{"ts": "2025-12-28T02:41:27", "event": "train_log", "step": 7254, "epoch": 3.060759493670886, "progress_pct": 51.01, "epoch_pct": 51.01, "eta": "31:15:02", "max_grad_norm": 0.8, "loss": 0.5646002888679504, "grad_norm": 1.307988166809082, "learning_rate": 5.510619657675783e-05} +{"ts": "2025-12-28T02:41:39", "event": "train_log", "step": 7256, "epoch": 3.0616033755274263, "progress_pct": 51.03, "epoch_pct": 51.03, "eta": "31:14:11", "max_grad_norm": 0.8, "loss": 0.5276528596878052, "grad_norm": 1.2408503293991089, "learning_rate": 5.508230692426002e-05} +{"ts": "2025-12-28T02:41:51", "event": "train_log", "step": 7258, "epoch": 3.0624472573839663, "progress_pct": 51.04, "epoch_pct": 51.04, "eta": "31:13:19", "max_grad_norm": 0.8, "loss": 0.5539094805717468, "grad_norm": 1.2521553039550781, "learning_rate": 5.505841609937161e-05} +{"ts": "2025-12-28T02:42:03", "event": "train_log", "step": 7260, "epoch": 3.0632911392405062, "progress_pct": 51.05, "epoch_pct": 51.05, "eta": "31:12:28", "max_grad_norm": 0.8, "loss": 0.5842460989952087, "grad_norm": 1.387758493423462, "learning_rate": 5.503452410760377e-05} +{"ts": "2025-12-28T02:42:16", "event": "train_log", "step": 7262, "epoch": 3.0641350210970466, "progress_pct": 51.07, "epoch_pct": 51.07, "eta": "31:11:37", "max_grad_norm": 0.8, "loss": 0.5075781345367432, "grad_norm": 1.300126552581787, "learning_rate": 5.501063095446789e-05} +{"ts": "2025-12-28T02:42:28", "event": "train_log", "step": 7264, "epoch": 3.0649789029535865, "progress_pct": 51.08, "epoch_pct": 51.08, "eta": "31:10:45", "max_grad_norm": 0.8, "loss": 0.5713207721710205, "grad_norm": 1.3773088455200195, "learning_rate": 5.498673664547569e-05} +{"ts": "2025-12-28T02:42:40", "event": "train_log", "step": 7266, "epoch": 3.0658227848101265, "progress_pct": 51.1, "epoch_pct": 51.1, "eta": "31:09:54", "max_grad_norm": 0.8, "loss": 0.523406982421875, "grad_norm": 1.3680146932601929, "learning_rate": 5.496284118613912e-05} +{"ts": "2025-12-28T02:42:53", "event": "train_log", "step": 7268, "epoch": 3.066666666666667, "progress_pct": 51.11, "epoch_pct": 51.11, "eta": "31:09:03", "max_grad_norm": 0.8, "loss": 0.5297801494598389, "grad_norm": 1.1380960941314697, "learning_rate": 5.493894458197041e-05} +{"ts": "2025-12-28T02:43:06", "event": "train_log", "step": 7270, "epoch": 3.067510548523207, "progress_pct": 51.13, "epoch_pct": 51.13, "eta": "31:08:12", "max_grad_norm": 0.8, "loss": 0.5217325091362, "grad_norm": 1.4078724384307861, "learning_rate": 5.491504683848202e-05} +{"ts": "2025-12-28T02:43:19", "event": "train_log", "step": 7272, "epoch": 3.0683544303797468, "progress_pct": 51.14, "epoch_pct": 51.14, "eta": "31:07:21", "max_grad_norm": 0.8, "loss": 0.5451233386993408, "grad_norm": 1.2392537593841553, "learning_rate": 5.489114796118674e-05} +{"ts": "2025-12-28T02:43:31", "event": "train_log", "step": 7274, "epoch": 3.0691983122362867, "progress_pct": 51.15, "epoch_pct": 51.15, "eta": "31:06:30", "max_grad_norm": 0.8, "loss": 0.5403155088424683, "grad_norm": 1.159034013748169, "learning_rate": 5.4867247955597544e-05} +{"ts": "2025-12-28T02:43:44", "event": "train_log", "step": 7276, "epoch": 3.070042194092827, "progress_pct": 51.17, "epoch_pct": 51.17, "eta": "31:05:39", "max_grad_norm": 0.8, "loss": 0.5579524636268616, "grad_norm": 1.1931780576705933, "learning_rate": 5.484334682722773e-05} +{"ts": "2025-12-28T02:43:56", "event": "train_log", "step": 7278, "epoch": 3.070886075949367, "progress_pct": 51.18, "epoch_pct": 51.18, "eta": "31:04:47", "max_grad_norm": 0.8, "loss": 0.570443868637085, "grad_norm": 1.1836986541748047, "learning_rate": 5.4819444581590805e-05} +{"ts": "2025-12-28T02:44:09", "event": "train_log", "step": 7280, "epoch": 3.071729957805907, "progress_pct": 51.2, "epoch_pct": 51.2, "eta": "31:03:57", "max_grad_norm": 0.8, "loss": 0.5276142358779907, "grad_norm": 1.2491910457611084, "learning_rate": 5.4795541224200595e-05} +{"ts": "2025-12-28T02:44:21", "event": "train_log", "step": 7282, "epoch": 3.0725738396624473, "progress_pct": 51.21, "epoch_pct": 51.21, "eta": "31:03:05", "max_grad_norm": 0.8, "loss": 0.48629680275917053, "grad_norm": 1.1931475400924683, "learning_rate": 5.477163676057112e-05} +{"ts": "2025-12-28T02:44:33", "event": "train_log", "step": 7284, "epoch": 3.0734177215189873, "progress_pct": 51.22, "epoch_pct": 51.22, "eta": "31:02:14", "max_grad_norm": 0.8, "loss": 0.5148687958717346, "grad_norm": 1.2027641534805298, "learning_rate": 5.4747731196216676e-05} +{"ts": "2025-12-28T02:44:46", "event": "train_log", "step": 7286, "epoch": 3.0742616033755272, "progress_pct": 51.24, "epoch_pct": 51.24, "eta": "31:01:23", "max_grad_norm": 0.8, "loss": 0.5084875226020813, "grad_norm": 1.4708147048950195, "learning_rate": 5.4723824536651844e-05} +{"ts": "2025-12-28T02:44:58", "event": "train_log", "step": 7288, "epoch": 3.0751054852320676, "progress_pct": 51.25, "epoch_pct": 51.25, "eta": "31:00:32", "max_grad_norm": 0.8, "loss": 0.5537340641021729, "grad_norm": 1.2080403566360474, "learning_rate": 5.4699916787391404e-05} +{"ts": "2025-12-28T02:45:11", "event": "train_log", "step": 7290, "epoch": 3.0759493670886076, "progress_pct": 51.27, "epoch_pct": 51.27, "eta": "30:59:41", "max_grad_norm": 0.8, "loss": 0.5617695450782776, "grad_norm": 1.1593934297561646, "learning_rate": 5.467600795395043e-05} +{"ts": "2025-12-28T02:45:23", "event": "train_log", "step": 7292, "epoch": 3.0767932489451475, "progress_pct": 51.28, "epoch_pct": 51.28, "eta": "30:58:50", "max_grad_norm": 0.8, "loss": 0.5376757383346558, "grad_norm": 1.2356617450714111, "learning_rate": 5.465209804184421e-05} +{"ts": "2025-12-28T02:45:36", "event": "train_log", "step": 7294, "epoch": 3.077637130801688, "progress_pct": 51.29, "epoch_pct": 51.29, "eta": "30:58:00", "max_grad_norm": 0.8, "loss": 0.5002268552780151, "grad_norm": 1.1403000354766846, "learning_rate": 5.4628187056588344e-05} +{"ts": "2025-12-28T02:45:48", "event": "train_log", "step": 7296, "epoch": 3.078481012658228, "progress_pct": 51.31, "epoch_pct": 51.31, "eta": "30:57:08", "max_grad_norm": 0.8, "loss": 0.6053714752197266, "grad_norm": 1.4888559579849243, "learning_rate": 5.460427500369858e-05} +{"ts": "2025-12-28T02:45:59", "event": "train_log", "step": 7298, "epoch": 3.0793248945147678, "progress_pct": 51.32, "epoch_pct": 51.32, "eta": "30:56:17", "max_grad_norm": 0.8, "loss": 0.6156546473503113, "grad_norm": 1.4204037189483643, "learning_rate": 5.4580361888691e-05} +{"ts": "2025-12-28T02:46:12", "event": "train_log", "step": 7300, "epoch": 3.080168776371308, "progress_pct": 51.34, "epoch_pct": 51.34, "eta": "30:55:26", "max_grad_norm": 0.8, "loss": 0.5875506401062012, "grad_norm": 1.9313244819641113, "learning_rate": 5.4556447717081925e-05} +{"ts": "2025-12-28T02:54:46", "event": "train_log", "step": 7300, "epoch": 3.080168776371308, "progress_pct": 51.34, "epoch_pct": 51.34, "eta": "31:03:33", "max_grad_norm": 0.8, "eval_loss": 0.678839385509491, "eval_runtime": 513.7013, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102} +{"ts": "2025-12-28T02:54:59", "event": "train_log", "step": 7302, "epoch": 3.081012658227848, "progress_pct": 51.35, "epoch_pct": 51.35, "eta": "31:02:42", "max_grad_norm": 0.8, "loss": 0.48908641934394836, "grad_norm": 1.1877124309539795, "learning_rate": 5.453253249438786e-05} +{"ts": "2025-12-28T02:55:11", "event": "train_log", "step": 7304, "epoch": 3.081856540084388, "progress_pct": 51.36, "epoch_pct": 51.36, "eta": "31:01:51", "max_grad_norm": 0.8, "loss": 0.5307457447052002, "grad_norm": 1.308233380317688, "learning_rate": 5.4508616226125595e-05} +{"ts": "2025-12-28T02:55:24", "event": "train_log", "step": 7306, "epoch": 3.0827004219409284, "progress_pct": 51.38, "epoch_pct": 51.38, "eta": "31:01:00", "max_grad_norm": 0.8, "loss": 0.48060229420661926, "grad_norm": 1.3067306280136108, "learning_rate": 5.4484698917812164e-05} +{"ts": "2025-12-28T02:55:37", "event": "train_log", "step": 7308, "epoch": 3.0835443037974684, "progress_pct": 51.39, "epoch_pct": 51.39, "eta": "31:00:10", "max_grad_norm": 0.8, "loss": 0.5300682187080383, "grad_norm": 1.3354034423828125, "learning_rate": 5.446078057496481e-05} +{"ts": "2025-12-28T02:55:50", "event": "train_log", "step": 7310, "epoch": 3.0843881856540083, "progress_pct": 51.41, "epoch_pct": 51.41, "eta": "30:59:20", "max_grad_norm": 0.8, "loss": 0.47262853384017944, "grad_norm": 1.1963045597076416, "learning_rate": 5.443686120310105e-05} +{"ts": "2025-12-28T02:56:03", "event": "train_log", "step": 7312, "epoch": 3.0852320675105487, "progress_pct": 51.42, "epoch_pct": 51.42, "eta": "30:58:29", "max_grad_norm": 0.8, "loss": 0.5363158583641052, "grad_norm": 1.352649450302124, "learning_rate": 5.441294080773863e-05} +{"ts": "2025-12-28T02:56:15", "event": "train_log", "step": 7314, "epoch": 3.0860759493670886, "progress_pct": 51.43, "epoch_pct": 51.43, "eta": "30:57:38", "max_grad_norm": 0.8, "loss": 0.5516205430030823, "grad_norm": 1.415164828300476, "learning_rate": 5.438901939439551e-05} +{"ts": "2025-12-28T02:56:28", "event": "train_log", "step": 7316, "epoch": 3.0869198312236286, "progress_pct": 51.45, "epoch_pct": 51.45, "eta": "30:56:47", "max_grad_norm": 0.8, "loss": 0.5458099246025085, "grad_norm": 1.2061728239059448, "learning_rate": 5.436509696858992e-05} +{"ts": "2025-12-28T02:56:41", "event": "train_log", "step": 7318, "epoch": 3.087763713080169, "progress_pct": 51.46, "epoch_pct": 51.46, "eta": "30:55:56", "max_grad_norm": 0.8, "loss": 0.5184649229049683, "grad_norm": 1.2327239513397217, "learning_rate": 5.434117353584027e-05} +{"ts": "2025-12-28T02:56:54", "event": "train_log", "step": 7320, "epoch": 3.088607594936709, "progress_pct": 51.48, "epoch_pct": 51.48, "eta": "30:55:06", "max_grad_norm": 0.8, "loss": 0.46032577753067017, "grad_norm": 1.0882518291473389, "learning_rate": 5.431724910166528e-05} +{"ts": "2025-12-28T02:57:06", "event": "train_log", "step": 7322, "epoch": 3.089451476793249, "progress_pct": 51.49, "epoch_pct": 51.49, "eta": "30:54:15", "max_grad_norm": 0.8, "loss": 0.5696587562561035, "grad_norm": 1.2710907459259033, "learning_rate": 5.429332367158384e-05} +{"ts": "2025-12-28T02:57:18", "event": "train_log", "step": 7324, "epoch": 3.090295358649789, "progress_pct": 51.5, "epoch_pct": 51.5, "eta": "30:53:24", "max_grad_norm": 0.8, "loss": 0.59807288646698, "grad_norm": 1.5157700777053833, "learning_rate": 5.4269397251115065e-05} +{"ts": "2025-12-28T02:57:31", "event": "train_log", "step": 7326, "epoch": 3.091139240506329, "progress_pct": 51.52, "epoch_pct": 51.52, "eta": "30:52:33", "max_grad_norm": 0.8, "loss": 0.5135430693626404, "grad_norm": 1.2869718074798584, "learning_rate": 5.424546984577835e-05} +{"ts": "2025-12-28T02:57:44", "event": "train_log", "step": 7328, "epoch": 3.091983122362869, "progress_pct": 51.53, "epoch_pct": 51.53, "eta": "30:51:42", "max_grad_norm": 0.8, "loss": 0.47552013397216797, "grad_norm": 1.19942045211792, "learning_rate": 5.4221541461093276e-05} +{"ts": "2025-12-28T02:57:57", "event": "train_log", "step": 7330, "epoch": 3.0928270042194095, "progress_pct": 51.55, "epoch_pct": 51.55, "eta": "30:50:52", "max_grad_norm": 0.8, "loss": 0.5549390316009521, "grad_norm": 1.4979162216186523, "learning_rate": 5.4197612102579665e-05} +{"ts": "2025-12-28T02:58:10", "event": "train_log", "step": 7332, "epoch": 3.0936708860759494, "progress_pct": 51.56, "epoch_pct": 51.56, "eta": "30:50:02", "max_grad_norm": 0.8, "loss": 0.4878964126110077, "grad_norm": 1.3181121349334717, "learning_rate": 5.4173681775757545e-05} +{"ts": "2025-12-28T02:58:23", "event": "train_log", "step": 7334, "epoch": 3.0945147679324894, "progress_pct": 51.58, "epoch_pct": 51.58, "eta": "30:49:11", "max_grad_norm": 0.8, "loss": 0.526251494884491, "grad_norm": 1.740233063697815, "learning_rate": 5.414975048614722e-05} +{"ts": "2025-12-28T02:58:35", "event": "train_log", "step": 7336, "epoch": 3.0953586497890297, "progress_pct": 51.59, "epoch_pct": 51.59, "eta": "30:48:21", "max_grad_norm": 0.8, "loss": 0.48297834396362305, "grad_norm": 1.2123478651046753, "learning_rate": 5.412581823926914e-05} +{"ts": "2025-12-28T02:58:48", "event": "train_log", "step": 7338, "epoch": 3.0962025316455697, "progress_pct": 51.6, "epoch_pct": 51.6, "eta": "30:47:30", "max_grad_norm": 0.8, "loss": 0.5184051990509033, "grad_norm": 1.2853679656982422, "learning_rate": 5.410188504064403e-05} +{"ts": "2025-12-28T02:59:01", "event": "train_log", "step": 7340, "epoch": 3.0970464135021096, "progress_pct": 51.62, "epoch_pct": 51.62, "eta": "30:46:40", "max_grad_norm": 0.8, "loss": 0.5476894974708557, "grad_norm": 1.2580705881118774, "learning_rate": 5.4077950895792815e-05} +{"ts": "2025-12-28T02:59:14", "event": "train_log", "step": 7342, "epoch": 3.09789029535865, "progress_pct": 51.63, "epoch_pct": 51.63, "eta": "30:45:49", "max_grad_norm": 0.8, "loss": 0.5379365682601929, "grad_norm": 1.3363854885101318, "learning_rate": 5.4054015810236666e-05} +{"ts": "2025-12-28T02:59:26", "event": "train_log", "step": 7344, "epoch": 3.09873417721519, "progress_pct": 51.65, "epoch_pct": 51.65, "eta": "30:44:59", "max_grad_norm": 0.8, "loss": 0.5325208306312561, "grad_norm": 1.3067597150802612, "learning_rate": 5.4030079789496925e-05} +{"ts": "2025-12-28T02:59:39", "event": "train_log", "step": 7346, "epoch": 3.09957805907173, "progress_pct": 51.66, "epoch_pct": 51.66, "eta": "30:44:08", "max_grad_norm": 0.8, "loss": 0.56773442029953, "grad_norm": 1.3179864883422852, "learning_rate": 5.400614283909515e-05} +{"ts": "2025-12-28T02:59:52", "event": "train_log", "step": 7348, "epoch": 3.10042194092827, "progress_pct": 51.67, "epoch_pct": 51.67, "eta": "30:43:18", "max_grad_norm": 0.8, "loss": 0.542707622051239, "grad_norm": 1.2006254196166992, "learning_rate": 5.3982204964553196e-05} +{"ts": "2025-12-28T03:00:05", "event": "train_log", "step": 7350, "epoch": 3.1012658227848102, "progress_pct": 51.69, "epoch_pct": 51.69, "eta": "30:42:28", "max_grad_norm": 0.8, "loss": 0.5579875111579895, "grad_norm": 1.2013983726501465, "learning_rate": 5.395826617139301e-05} +{"ts": "2025-12-28T03:00:17", "event": "train_log", "step": 7352, "epoch": 3.10210970464135, "progress_pct": 51.7, "epoch_pct": 51.7, "eta": "30:41:37", "max_grad_norm": 0.8, "loss": 0.5369019508361816, "grad_norm": 1.2002209424972534, "learning_rate": 5.3934326465136854e-05} +{"ts": "2025-12-28T03:00:30", "event": "train_log", "step": 7354, "epoch": 3.10295358649789, "progress_pct": 51.72, "epoch_pct": 51.72, "eta": "30:40:47", "max_grad_norm": 0.8, "loss": 0.5573506355285645, "grad_norm": 1.1660926342010498, "learning_rate": 5.3910385851307133e-05} +{"ts": "2025-12-28T03:00:42", "event": "train_log", "step": 7356, "epoch": 3.1037974683544305, "progress_pct": 51.73, "epoch_pct": 51.73, "eta": "30:39:56", "max_grad_norm": 0.8, "loss": 0.5485683679580688, "grad_norm": 1.3189473152160645, "learning_rate": 5.38864443354265e-05} +{"ts": "2025-12-28T03:00:56", "event": "train_log", "step": 7358, "epoch": 3.1046413502109704, "progress_pct": 51.74, "epoch_pct": 51.74, "eta": "30:39:07", "max_grad_norm": 0.8, "loss": 0.4980843663215637, "grad_norm": 1.11967134475708, "learning_rate": 5.38625019230178e-05} +{"ts": "2025-12-28T03:01:08", "event": "train_log", "step": 7360, "epoch": 3.1054852320675104, "progress_pct": 51.76, "epoch_pct": 51.76, "eta": "30:38:15", "max_grad_norm": 0.8, "loss": 0.5331753492355347, "grad_norm": 1.429019570350647, "learning_rate": 5.3838558619604074e-05} +{"ts": "2025-12-28T03:01:20", "event": "train_log", "step": 7362, "epoch": 3.1063291139240508, "progress_pct": 51.77, "epoch_pct": 51.77, "eta": "30:37:25", "max_grad_norm": 0.8, "loss": 0.5362547636032104, "grad_norm": 1.2600942850112915, "learning_rate": 5.381461443070862e-05} +{"ts": "2025-12-28T03:01:32", "event": "train_log", "step": 7364, "epoch": 3.1071729957805907, "progress_pct": 51.79, "epoch_pct": 51.79, "eta": "30:36:34", "max_grad_norm": 0.8, "loss": 0.5793240070343018, "grad_norm": 1.6344311237335205, "learning_rate": 5.379066936185486e-05} +{"ts": "2025-12-28T03:01:44", "event": "train_log", "step": 7366, "epoch": 3.1080168776371306, "progress_pct": 51.8, "epoch_pct": 51.8, "eta": "30:35:43", "max_grad_norm": 0.8, "loss": 0.5316762328147888, "grad_norm": 1.4372280836105347, "learning_rate": 5.376672341856649e-05} +{"ts": "2025-12-28T03:01:57", "event": "train_log", "step": 7368, "epoch": 3.108860759493671, "progress_pct": 51.81, "epoch_pct": 51.81, "eta": "30:34:53", "max_grad_norm": 0.8, "loss": 0.5305402874946594, "grad_norm": 1.4075509309768677, "learning_rate": 5.3742776606367364e-05} +{"ts": "2025-12-28T03:02:10", "event": "train_log", "step": 7370, "epoch": 3.109704641350211, "progress_pct": 51.83, "epoch_pct": 51.83, "eta": "30:34:03", "max_grad_norm": 0.8, "loss": 0.5756345391273499, "grad_norm": 1.6254384517669678, "learning_rate": 5.371882893078156e-05} +{"ts": "2025-12-28T03:02:22", "event": "train_log", "step": 7372, "epoch": 3.110548523206751, "progress_pct": 51.84, "epoch_pct": 51.84, "eta": "30:33:12", "max_grad_norm": 0.8, "loss": 0.4959688186645508, "grad_norm": 1.2218619585037231, "learning_rate": 5.3694880397333335e-05} +{"ts": "2025-12-28T03:02:34", "event": "train_log", "step": 7374, "epoch": 3.1113924050632913, "progress_pct": 51.86, "epoch_pct": 51.86, "eta": "30:32:22", "max_grad_norm": 0.8, "loss": 0.593587338924408, "grad_norm": 1.3503917455673218, "learning_rate": 5.3670931011547166e-05} +{"ts": "2025-12-28T03:02:46", "event": "train_log", "step": 7376, "epoch": 3.1122362869198312, "progress_pct": 51.87, "epoch_pct": 51.87, "eta": "30:31:31", "max_grad_norm": 0.8, "loss": 0.5407475233078003, "grad_norm": 1.403222918510437, "learning_rate": 5.364698077894772e-05} +{"ts": "2025-12-28T03:02:58", "event": "train_log", "step": 7378, "epoch": 3.113080168776371, "progress_pct": 51.88, "epoch_pct": 51.88, "eta": "30:30:40", "max_grad_norm": 0.8, "loss": 0.6125431060791016, "grad_norm": 1.4017539024353027, "learning_rate": 5.3623029705059835e-05} +{"ts": "2025-12-28T03:03:10", "event": "train_log", "step": 7380, "epoch": 3.1139240506329116, "progress_pct": 51.9, "epoch_pct": 51.9, "eta": "30:29:49", "max_grad_norm": 0.8, "loss": 0.5179317593574524, "grad_norm": 1.4538600444793701, "learning_rate": 5.359907779540859e-05} +{"ts": "2025-12-28T03:03:22", "event": "train_log", "step": 7382, "epoch": 3.1147679324894515, "progress_pct": 51.91, "epoch_pct": 51.91, "eta": "30:28:59", "max_grad_norm": 0.8, "loss": 0.43457767367362976, "grad_norm": 1.2120319604873657, "learning_rate": 5.3575125055519225e-05} +{"ts": "2025-12-28T03:03:35", "event": "train_log", "step": 7384, "epoch": 3.1156118143459914, "progress_pct": 51.93, "epoch_pct": 51.93, "eta": "30:28:09", "max_grad_norm": 0.8, "loss": 0.5810346603393555, "grad_norm": 1.3049911260604858, "learning_rate": 5.355117149091717e-05} +{"ts": "2025-12-28T03:03:48", "event": "train_log", "step": 7386, "epoch": 3.116455696202532, "progress_pct": 51.94, "epoch_pct": 51.94, "eta": "30:27:19", "max_grad_norm": 0.8, "loss": 0.4865732789039612, "grad_norm": 1.1788939237594604, "learning_rate": 5.3527217107128036e-05} +{"ts": "2025-12-28T03:04:00", "event": "train_log", "step": 7388, "epoch": 3.1172995780590718, "progress_pct": 51.95, "epoch_pct": 51.95, "eta": "30:26:29", "max_grad_norm": 0.8, "loss": 0.5363003015518188, "grad_norm": 1.4433233737945557, "learning_rate": 5.350326190967768e-05} +{"ts": "2025-12-28T03:04:13", "event": "train_log", "step": 7390, "epoch": 3.1181434599156117, "progress_pct": 51.97, "epoch_pct": 51.97, "eta": "30:25:39", "max_grad_norm": 0.8, "loss": 0.5111554861068726, "grad_norm": 1.2610430717468262, "learning_rate": 5.347930590409207e-05} +{"ts": "2025-12-28T03:04:26", "event": "train_log", "step": 7392, "epoch": 3.118987341772152, "progress_pct": 51.98, "epoch_pct": 51.98, "eta": "30:24:49", "max_grad_norm": 0.8, "loss": 0.5018916726112366, "grad_norm": 1.1659626960754395, "learning_rate": 5.345534909589742e-05} +{"ts": "2025-12-28T03:04:39", "event": "train_log", "step": 7394, "epoch": 3.119831223628692, "progress_pct": 52.0, "epoch_pct": 52.0, "eta": "30:23:59", "max_grad_norm": 0.8, "loss": 0.5640519261360168, "grad_norm": 1.1380181312561035, "learning_rate": 5.343139149062008e-05} +{"ts": "2025-12-28T03:04:52", "event": "train_log", "step": 7396, "epoch": 3.120675105485232, "progress_pct": 52.01, "epoch_pct": 52.01, "eta": "30:23:10", "max_grad_norm": 0.8, "loss": 0.5245673060417175, "grad_norm": 2.249542713165283, "learning_rate": 5.340743309378663e-05} +{"ts": "2025-12-28T03:05:04", "event": "train_log", "step": 7398, "epoch": 3.1215189873417724, "progress_pct": 52.03, "epoch_pct": 52.03, "eta": "30:22:20", "max_grad_norm": 0.8, "loss": 0.5807012915611267, "grad_norm": 1.288784384727478, "learning_rate": 5.338347391092381e-05} +{"ts": "2025-12-28T03:05:17", "event": "train_log", "step": 7400, "epoch": 3.1223628691983123, "progress_pct": 52.04, "epoch_pct": 52.04, "eta": "30:21:30", "max_grad_norm": 0.8, "loss": 0.5531487464904785, "grad_norm": 1.3856520652770996, "learning_rate": 5.3359513947558525e-05} +{"ts": "2025-12-28T03:13:51", "event": "train_log", "step": 7400, "epoch": 3.1223628691983123, "progress_pct": 52.04, "epoch_pct": 52.04, "eta": "30:29:23", "max_grad_norm": 0.8, "eval_loss": 0.676459550857544, "eval_runtime": 513.5901, "eval_samples_per_second": 4.102, "eval_steps_per_second": 4.102} +{"ts": "2025-12-28T03:14:03", "event": "train_log", "step": 7402, "epoch": 3.1232067510548522, "progress_pct": 52.05, "epoch_pct": 52.05, "eta": "30:28:32", "max_grad_norm": 0.8, "loss": 0.610917329788208, "grad_norm": 1.4650845527648926, "learning_rate": 5.333555320921791e-05} +{"ts": "2025-12-28T03:14:15", "event": "train_log", "step": 7404, "epoch": 3.124050632911392, "progress_pct": 52.07, "epoch_pct": 52.07, "eta": "30:27:41", "max_grad_norm": 0.8, "loss": 0.5132687091827393, "grad_norm": 1.4528120756149292, "learning_rate": 5.331159170142923e-05} +{"ts": "2025-12-28T03:14:27", "event": "train_log", "step": 7406, "epoch": 3.1248945147679326, "progress_pct": 52.08, "epoch_pct": 52.08, "eta": "30:26:51", "max_grad_norm": 0.8, "loss": 0.5424297451972961, "grad_norm": 1.297371745109558, "learning_rate": 5.328762942971994e-05} +{"ts": "2025-12-28T03:14:38", "event": "train_log", "step": 7408, "epoch": 3.1257383966244725, "progress_pct": 52.1, "epoch_pct": 52.1, "eta": "30:26:00", "max_grad_norm": 0.8, "loss": 0.5797439813613892, "grad_norm": 1.3470855951309204, "learning_rate": 5.326366639961767e-05} +{"ts": "2025-12-28T03:14:51", "event": "train_log", "step": 7410, "epoch": 3.1265822784810124, "progress_pct": 52.11, "epoch_pct": 52.11, "eta": "30:25:10", "max_grad_norm": 0.8, "loss": 0.45681023597717285, "grad_norm": 1.0487306118011475, "learning_rate": 5.323970261665027e-05} +{"ts": "2025-12-28T03:15:05", "event": "train_log", "step": 7412, "epoch": 3.127426160337553, "progress_pct": 52.12, "epoch_pct": 52.12, "eta": "30:24:21", "max_grad_norm": 0.8, "loss": 0.5057253241539001, "grad_norm": 1.1351137161254883, "learning_rate": 5.321573808634567e-05} +{"ts": "2025-12-28T03:15:18", "event": "train_log", "step": 7414, "epoch": 3.1282700421940928, "progress_pct": 52.14, "epoch_pct": 52.14, "eta": "30:23:31", "max_grad_norm": 0.8, "loss": 0.55838543176651, "grad_norm": 1.3002208471298218, "learning_rate": 5.3191772814232055e-05} +{"ts": "2025-12-28T03:15:30", "event": "train_log", "step": 7416, "epoch": 3.1291139240506327, "progress_pct": 52.15, "epoch_pct": 52.15, "eta": "30:22:40", "max_grad_norm": 0.8, "loss": 0.5052227973937988, "grad_norm": 1.3143419027328491, "learning_rate": 5.316780680583776e-05} +{"ts": "2025-12-28T03:15:43", "event": "train_log", "step": 7418, "epoch": 3.129957805907173, "progress_pct": 52.17, "epoch_pct": 52.17, "eta": "30:21:50", "max_grad_norm": 0.8, "loss": 0.5119181871414185, "grad_norm": 1.2752583026885986, "learning_rate": 5.314384006669126e-05} +{"ts": "2025-12-28T03:15:56", "event": "train_log", "step": 7420, "epoch": 3.130801687763713, "progress_pct": 52.18, "epoch_pct": 52.18, "eta": "30:21:01", "max_grad_norm": 0.8, "loss": 0.5696089267730713, "grad_norm": 1.2892590761184692, "learning_rate": 5.3119872602321256e-05} +{"ts": "2025-12-28T03:16:08", "event": "train_log", "step": 7422, "epoch": 3.131645569620253, "progress_pct": 52.19, "epoch_pct": 52.19, "eta": "30:20:10", "max_grad_norm": 0.8, "loss": 0.6057182550430298, "grad_norm": 1.510764718055725, "learning_rate": 5.309590441825654e-05} +{"ts": "2025-12-28T03:16:20", "event": "train_log", "step": 7424, "epoch": 3.1324894514767934, "progress_pct": 52.21, "epoch_pct": 52.21, "eta": "30:19:20", "max_grad_norm": 0.8, "loss": 0.5079684853553772, "grad_norm": 1.2366914749145508, "learning_rate": 5.307193552002616e-05} +{"ts": "2025-12-28T03:16:34", "event": "train_log", "step": 7426, "epoch": 3.1333333333333333, "progress_pct": 52.22, "epoch_pct": 52.22, "eta": "30:18:31", "max_grad_norm": 0.8, "loss": 0.4977130591869354, "grad_norm": 1.2063475847244263, "learning_rate": 5.3047965913159226e-05} +{"ts": "2025-12-28T03:16:46", "event": "train_log", "step": 7428, "epoch": 3.1341772151898732, "progress_pct": 52.24, "epoch_pct": 52.24, "eta": "30:17:40", "max_grad_norm": 0.8, "loss": 0.5761610865592957, "grad_norm": 1.603097677230835, "learning_rate": 5.30239956031851e-05} +{"ts": "2025-12-28T03:16:58", "event": "train_log", "step": 7430, "epoch": 3.1350210970464136, "progress_pct": 52.25, "epoch_pct": 52.25, "eta": "30:16:50", "max_grad_norm": 0.8, "loss": 0.5743051767349243, "grad_norm": 1.2723357677459717, "learning_rate": 5.300002459563328e-05} +{"ts": "2025-12-28T03:17:11", "event": "train_log", "step": 7432, "epoch": 3.1358649789029536, "progress_pct": 52.26, "epoch_pct": 52.26, "eta": "30:16:01", "max_grad_norm": 0.8, "loss": 0.5411891341209412, "grad_norm": 1.3077106475830078, "learning_rate": 5.297605289603338e-05} +{"ts": "2025-12-28T03:17:24", "event": "train_log", "step": 7434, "epoch": 3.1367088607594935, "progress_pct": 52.28, "epoch_pct": 52.28, "eta": "30:15:11", "max_grad_norm": 0.8, "loss": 0.5488677620887756, "grad_norm": 1.3610905408859253, "learning_rate": 5.2952080509915246e-05} +{"ts": "2025-12-28T03:17:37", "event": "train_log", "step": 7436, "epoch": 3.137552742616034, "progress_pct": 52.29, "epoch_pct": 52.29, "eta": "30:14:21", "max_grad_norm": 0.8, "loss": 0.554864227771759, "grad_norm": 1.1999255418777466, "learning_rate": 5.292810744280884e-05} +{"ts": "2025-12-28T03:17:50", "event": "train_log", "step": 7438, "epoch": 3.138396624472574, "progress_pct": 52.31, "epoch_pct": 52.31, "eta": "30:13:32", "max_grad_norm": 0.8, "loss": 0.49844983220100403, "grad_norm": 1.2868118286132812, "learning_rate": 5.2904133700244276e-05} +{"ts": "2025-12-28T03:18:03", "event": "train_log", "step": 7440, "epoch": 3.1392405063291138, "progress_pct": 52.32, "epoch_pct": 52.32, "eta": "30:12:42", "max_grad_norm": 0.8, "loss": 0.5171698331832886, "grad_norm": 1.3824434280395508, "learning_rate": 5.288015928775183e-05} +{"ts": "2025-12-28T03:18:15", "event": "train_log", "step": 7442, "epoch": 3.140084388185654, "progress_pct": 52.33, "epoch_pct": 52.33, "eta": "30:11:52", "max_grad_norm": 0.8, "loss": 0.550440788269043, "grad_norm": 1.502249002456665, "learning_rate": 5.285618421086197e-05} +{"ts": "2025-12-28T03:18:27", "event": "train_log", "step": 7444, "epoch": 3.140928270042194, "progress_pct": 52.35, "epoch_pct": 52.35, "eta": "30:11:02", "max_grad_norm": 0.8, "loss": 0.5033495426177979, "grad_norm": 1.2650765180587769, "learning_rate": 5.283220847510526e-05} +{"ts": "2025-12-28T03:18:40", "event": "train_log", "step": 7446, "epoch": 3.141772151898734, "progress_pct": 52.36, "epoch_pct": 52.36, "eta": "30:10:12", "max_grad_norm": 0.8, "loss": 0.48968273401260376, "grad_norm": 1.2669732570648193, "learning_rate": 5.280823208601244e-05} +{"ts": "2025-12-28T03:18:53", "event": "train_log", "step": 7448, "epoch": 3.1426160337552744, "progress_pct": 52.38, "epoch_pct": 52.38, "eta": "30:09:23", "max_grad_norm": 0.8, "loss": 0.4713798463344574, "grad_norm": 1.104645848274231, "learning_rate": 5.278425504911442e-05} +{"ts": "2025-12-28T03:19:05", "event": "train_log", "step": 7450, "epoch": 3.1434599156118144, "progress_pct": 52.39, "epoch_pct": 52.39, "eta": "30:08:33", "max_grad_norm": 0.8, "loss": 0.5249105095863342, "grad_norm": 1.2858284711837769, "learning_rate": 5.276027736994224e-05} +{"ts": "2025-12-28T03:19:18", "event": "train_log", "step": 7452, "epoch": 3.1443037974683543, "progress_pct": 52.41, "epoch_pct": 52.41, "eta": "30:07:43", "max_grad_norm": 0.8, "loss": 0.5125989317893982, "grad_norm": 1.3720128536224365, "learning_rate": 5.2736299054027064e-05} +{"ts": "2025-12-28T03:19:30", "event": "train_log", "step": 7454, "epoch": 3.1451476793248947, "progress_pct": 52.42, "epoch_pct": 52.42, "eta": "30:06:53", "max_grad_norm": 0.8, "loss": 0.5324952006340027, "grad_norm": 1.2519328594207764, "learning_rate": 5.271232010690025e-05} +{"ts": "2025-12-28T03:19:43", "event": "train_log", "step": 7456, "epoch": 3.1459915611814346, "progress_pct": 52.43, "epoch_pct": 52.43, "eta": "30:06:04", "max_grad_norm": 0.8, "loss": 0.4961182475090027, "grad_norm": 1.2284791469573975, "learning_rate": 5.2688340534093295e-05} +{"ts": "2025-12-28T03:19:55", "event": "train_log", "step": 7458, "epoch": 3.1468354430379746, "progress_pct": 52.45, "epoch_pct": 52.45, "eta": "30:05:14", "max_grad_norm": 0.8, "loss": 0.5569467544555664, "grad_norm": 1.428916335105896, "learning_rate": 5.26643603411378e-05} +{"ts": "2025-12-28T03:20:08", "event": "train_log", "step": 7460, "epoch": 3.147679324894515, "progress_pct": 52.46, "epoch_pct": 52.46, "eta": "30:04:24", "max_grad_norm": 0.8, "loss": 0.476906418800354, "grad_norm": 1.172302007675171, "learning_rate": 5.264037953356554e-05} +{"ts": "2025-12-28T03:20:20", "event": "train_log", "step": 7462, "epoch": 3.148523206751055, "progress_pct": 52.48, "epoch_pct": 52.48, "eta": "30:03:34", "max_grad_norm": 0.8, "loss": 0.5321967601776123, "grad_norm": 1.2087178230285645, "learning_rate": 5.261639811690843e-05} +{"ts": "2025-12-28T03:20:33", "event": "train_log", "step": 7464, "epoch": 3.149367088607595, "progress_pct": 52.49, "epoch_pct": 52.49, "eta": "30:02:45", "max_grad_norm": 0.8, "loss": 0.5333749651908875, "grad_norm": 1.1226983070373535, "learning_rate": 5.259241609669854e-05} +{"ts": "2025-12-28T03:20:46", "event": "train_log", "step": 7466, "epoch": 3.1502109704641352, "progress_pct": 52.5, "epoch_pct": 52.5, "eta": "30:01:56", "max_grad_norm": 0.8, "loss": 0.5035849809646606, "grad_norm": 1.156534194946289, "learning_rate": 5.256843347846803e-05} +{"ts": "2025-12-28T03:20:59", "event": "train_log", "step": 7468, "epoch": 3.151054852320675, "progress_pct": 52.52, "epoch_pct": 52.52, "eta": "30:01:06", "max_grad_norm": 0.8, "loss": 0.4934900104999542, "grad_norm": 1.3600608110427856, "learning_rate": 5.2544450267749244e-05} +{"ts": "2025-12-28T03:21:12", "event": "train_log", "step": 7470, "epoch": 3.151898734177215, "progress_pct": 52.53, "epoch_pct": 52.53, "eta": "30:00:17", "max_grad_norm": 0.8, "loss": 0.5037409067153931, "grad_norm": 1.2820971012115479, "learning_rate": 5.252046647007465e-05} +{"ts": "2025-12-28T03:21:25", "event": "train_log", "step": 7472, "epoch": 3.1527426160337555, "progress_pct": 52.55, "epoch_pct": 52.55, "eta": "29:59:28", "max_grad_norm": 0.8, "loss": 0.5056651830673218, "grad_norm": 1.1549314260482788, "learning_rate": 5.249648209097685e-05} +{"ts": "2025-12-28T03:21:38", "event": "train_log", "step": 7474, "epoch": 3.1535864978902954, "progress_pct": 52.56, "epoch_pct": 52.56, "eta": "29:58:39", "max_grad_norm": 0.8, "loss": 0.4975930154323578, "grad_norm": 1.1724461317062378, "learning_rate": 5.2472497135988586e-05} +{"ts": "2025-12-28T03:21:51", "event": "train_log", "step": 7476, "epoch": 3.1544303797468354, "progress_pct": 52.57, "epoch_pct": 52.57, "eta": "29:57:49", "max_grad_norm": 0.8, "loss": 0.46283629536628723, "grad_norm": 1.1598713397979736, "learning_rate": 5.2448511610642695e-05} +{"ts": "2025-12-28T03:22:03", "event": "train_log", "step": 7478, "epoch": 3.1552742616033758, "progress_pct": 52.59, "epoch_pct": 52.59, "eta": "29:57:00", "max_grad_norm": 0.8, "loss": 0.557330846786499, "grad_norm": 1.44228196144104, "learning_rate": 5.2424525520472236e-05} +{"ts": "2025-12-28T03:22:16", "event": "train_log", "step": 7480, "epoch": 3.1561181434599157, "progress_pct": 52.6, "epoch_pct": 52.6, "eta": "29:56:11", "max_grad_norm": 0.8, "loss": 0.513535737991333, "grad_norm": 1.3199583292007446, "learning_rate": 5.2400538871010266e-05} +{"ts": "2025-12-28T03:22:29", "event": "train_log", "step": 7482, "epoch": 3.1569620253164556, "progress_pct": 52.62, "epoch_pct": 52.62, "eta": "29:55:21", "max_grad_norm": 0.8, "loss": 0.5200037956237793, "grad_norm": 1.180692434310913, "learning_rate": 5.23765516677901e-05} +{"ts": "2025-12-28T03:22:41", "event": "train_log", "step": 7484, "epoch": 3.1578059071729956, "progress_pct": 52.63, "epoch_pct": 52.63, "eta": "29:54:32", "max_grad_norm": 0.8, "loss": 0.576216459274292, "grad_norm": 1.4217020273208618, "learning_rate": 5.23525639163451e-05} +{"ts": "2025-12-28T03:22:53", "event": "train_log", "step": 7486, "epoch": 3.158649789029536, "progress_pct": 52.64, "epoch_pct": 52.64, "eta": "29:53:42", "max_grad_norm": 0.8, "loss": 0.555095374584198, "grad_norm": 1.238783359527588, "learning_rate": 5.23285756222088e-05} +{"ts": "2025-12-28T03:23:06", "event": "train_log", "step": 7488, "epoch": 3.159493670886076, "progress_pct": 52.66, "epoch_pct": 52.66, "eta": "29:52:53", "max_grad_norm": 0.8, "loss": 0.5228440761566162, "grad_norm": 1.293283462524414, "learning_rate": 5.2304586790914815e-05} +{"ts": "2025-12-28T03:23:19", "event": "train_log", "step": 7490, "epoch": 3.160337552742616, "progress_pct": 52.67, "epoch_pct": 52.67, "eta": "29:52:04", "max_grad_norm": 0.8, "loss": 0.5684541463851929, "grad_norm": 1.373578429222107, "learning_rate": 5.22805974279969e-05} +{"ts": "2025-12-28T03:23:32", "event": "train_log", "step": 7492, "epoch": 3.1611814345991562, "progress_pct": 52.69, "epoch_pct": 52.69, "eta": "29:51:15", "max_grad_norm": 0.8, "loss": 0.4627608358860016, "grad_norm": 1.1387807130813599, "learning_rate": 5.225660753898899e-05} +{"ts": "2025-12-28T03:23:44", "event": "train_log", "step": 7494, "epoch": 3.162025316455696, "progress_pct": 52.7, "epoch_pct": 52.7, "eta": "29:50:25", "max_grad_norm": 0.8, "loss": 0.5046111345291138, "grad_norm": 1.1708600521087646, "learning_rate": 5.223261712942504e-05} +{"ts": "2025-12-28T03:23:56", "event": "train_log", "step": 7496, "epoch": 3.162869198312236, "progress_pct": 52.71, "epoch_pct": 52.71, "eta": "29:49:35", "max_grad_norm": 0.8, "loss": 0.5108349323272705, "grad_norm": 1.3370471000671387, "learning_rate": 5.220862620483921e-05} +{"ts": "2025-12-28T03:24:08", "event": "train_log", "step": 7498, "epoch": 3.1637130801687765, "progress_pct": 52.73, "epoch_pct": 52.73, "eta": "29:48:45", "max_grad_norm": 0.8, "loss": 0.525260329246521, "grad_norm": 1.399530053138733, "learning_rate": 5.2184634770765716e-05} +{"ts": "2025-12-28T03:24:20", "event": "train_log", "step": 7500, "epoch": 3.1645569620253164, "progress_pct": 52.74, "epoch_pct": 52.74, "eta": "29:47:56", "max_grad_norm": 0.8, "loss": 0.6050346493721008, "grad_norm": 1.4769412279129028, "learning_rate": 5.216064283273896e-05} +{"ts": "2025-12-28T03:32:54", "event": "train_log", "step": 7500, "epoch": 3.1645569620253164, "progress_pct": 52.74, "epoch_pct": 52.74, "eta": "29:55:36", "max_grad_norm": 0.8, "eval_loss": 0.6774632334709167, "eval_runtime": 513.4064, "eval_samples_per_second": 4.104, "eval_steps_per_second": 4.104} +{"ts": "2025-12-28T03:33:07", "event": "train_log", "step": 7502, "epoch": 3.1654008438818564, "progress_pct": 52.76, "epoch_pct": 52.76, "eta": "29:54:47", "max_grad_norm": 0.8, "loss": 0.4950231611728668, "grad_norm": 1.5023313760757446, "learning_rate": 5.213665039629337e-05} +{"ts": "2025-12-28T03:33:19", "event": "train_log", "step": 7504, "epoch": 3.1662447257383968, "progress_pct": 52.77, "epoch_pct": 52.77, "eta": "29:53:57", "max_grad_norm": 0.8, "loss": 0.5718420743942261, "grad_norm": 1.4648873805999756, "learning_rate": 5.211265746696359e-05} +{"ts": "2025-12-28T03:33:31", "event": "train_log", "step": 7506, "epoch": 3.1670886075949367, "progress_pct": 52.78, "epoch_pct": 52.78, "eta": "29:53:07", "max_grad_norm": 0.8, "loss": 0.5318358540534973, "grad_norm": 1.2604464292526245, "learning_rate": 5.20886640502843e-05} +{"ts": "2025-12-28T03:33:44", "event": "train_log", "step": 7508, "epoch": 3.1679324894514767, "progress_pct": 52.8, "epoch_pct": 52.8, "eta": "29:52:17", "max_grad_norm": 0.8, "loss": 0.5378178358078003, "grad_norm": 1.2533620595932007, "learning_rate": 5.206467015179032e-05} +{"ts": "2025-12-28T03:33:56", "event": "train_log", "step": 7510, "epoch": 3.168776371308017, "progress_pct": 52.81, "epoch_pct": 52.81, "eta": "29:51:28", "max_grad_norm": 0.8, "loss": 0.587459146976471, "grad_norm": 1.2717853784561157, "learning_rate": 5.204067577701659e-05} +{"ts": "2025-12-28T03:34:09", "event": "train_log", "step": 7512, "epoch": 3.169620253164557, "progress_pct": 52.83, "epoch_pct": 52.83, "eta": "29:50:38", "max_grad_norm": 0.8, "loss": 0.5217386484146118, "grad_norm": 1.1071430444717407, "learning_rate": 5.201668093149816e-05} +{"ts": "2025-12-28T03:34:22", "event": "train_log", "step": 7514, "epoch": 3.170464135021097, "progress_pct": 52.84, "epoch_pct": 52.84, "eta": "29:49:50", "max_grad_norm": 0.8, "loss": 0.499948114156723, "grad_norm": 1.187596082687378, "learning_rate": 5.1992685620770166e-05} +{"ts": "2025-12-28T03:34:35", "event": "train_log", "step": 7516, "epoch": 3.1713080168776373, "progress_pct": 52.86, "epoch_pct": 52.86, "eta": "29:49:00", "max_grad_norm": 0.8, "loss": 0.5857545137405396, "grad_norm": 1.269935131072998, "learning_rate": 5.196868985036787e-05} +{"ts": "2025-12-28T03:34:47", "event": "train_log", "step": 7518, "epoch": 3.1721518987341772, "progress_pct": 52.87, "epoch_pct": 52.87, "eta": "29:48:11", "max_grad_norm": 0.8, "loss": 0.5482431650161743, "grad_norm": 1.363231897354126, "learning_rate": 5.194469362582663e-05} +{"ts": "2025-12-28T03:34:59", "event": "train_log", "step": 7520, "epoch": 3.172995780590717, "progress_pct": 52.88, "epoch_pct": 52.88, "eta": "29:47:21", "max_grad_norm": 0.8, "loss": 0.484623521566391, "grad_norm": 1.5368403196334839, "learning_rate": 5.1920696952681925e-05} +{"ts": "2025-12-28T03:35:12", "event": "train_log", "step": 7522, "epoch": 3.1738396624472576, "progress_pct": 52.9, "epoch_pct": 52.9, "eta": "29:46:32", "max_grad_norm": 0.8, "loss": 0.4609130620956421, "grad_norm": 1.1487576961517334, "learning_rate": 5.189669983646932e-05} +{"ts": "2025-12-28T03:35:24", "event": "train_log", "step": 7524, "epoch": 3.1746835443037975, "progress_pct": 52.91, "epoch_pct": 52.91, "eta": "29:45:43", "max_grad_norm": 0.8, "loss": 0.5638056993484497, "grad_norm": 1.262457013130188, "learning_rate": 5.187270228272448e-05} +{"ts": "2025-12-28T03:35:38", "event": "train_log", "step": 7526, "epoch": 3.1755274261603375, "progress_pct": 52.93, "epoch_pct": 52.93, "eta": "29:44:54", "max_grad_norm": 0.8, "loss": 0.506156861782074, "grad_norm": 1.1573964357376099, "learning_rate": 5.184870429698321e-05} +{"ts": "2025-12-28T03:35:50", "event": "train_log", "step": 7528, "epoch": 3.176371308016878, "progress_pct": 52.94, "epoch_pct": 52.94, "eta": "29:44:04", "max_grad_norm": 0.8, "loss": 0.533206045627594, "grad_norm": 1.2594209909439087, "learning_rate": 5.182470588478134e-05} +{"ts": "2025-12-28T03:36:02", "event": "train_log", "step": 7530, "epoch": 3.1772151898734178, "progress_pct": 52.95, "epoch_pct": 52.95, "eta": "29:43:15", "max_grad_norm": 0.8, "loss": 0.5168673396110535, "grad_norm": 1.268861174583435, "learning_rate": 5.1800707051654874e-05} +{"ts": "2025-12-28T03:36:14", "event": "train_log", "step": 7532, "epoch": 3.1780590717299577, "progress_pct": 52.97, "epoch_pct": 52.97, "eta": "29:42:25", "max_grad_norm": 0.8, "loss": 0.5314695239067078, "grad_norm": 1.3061206340789795, "learning_rate": 5.177670780313989e-05} +{"ts": "2025-12-28T03:36:25", "event": "train_log", "step": 7534, "epoch": 3.1789029535864977, "progress_pct": 52.98, "epoch_pct": 52.98, "eta": "29:41:35", "max_grad_norm": 0.8, "loss": 0.5938999056816101, "grad_norm": 1.5774306058883667, "learning_rate": 5.175270814477252e-05} +{"ts": "2025-12-28T03:36:38", "event": "train_log", "step": 7536, "epoch": 3.179746835443038, "progress_pct": 53.0, "epoch_pct": 53.0, "eta": "29:40:45", "max_grad_norm": 0.8, "loss": 0.6023578643798828, "grad_norm": 1.3401720523834229, "learning_rate": 5.172870808208905e-05} +{"ts": "2025-12-28T03:36:50", "event": "train_log", "step": 7538, "epoch": 3.180590717299578, "progress_pct": 53.01, "epoch_pct": 53.01, "eta": "29:39:56", "max_grad_norm": 0.8, "loss": 0.5246027708053589, "grad_norm": 1.3131903409957886, "learning_rate": 5.1704707620625823e-05} +{"ts": "2025-12-28T03:37:02", "event": "train_log", "step": 7540, "epoch": 3.181434599156118, "progress_pct": 53.02, "epoch_pct": 53.02, "eta": "29:39:06", "max_grad_norm": 0.8, "loss": 0.6149776577949524, "grad_norm": 1.4343721866607666, "learning_rate": 5.168070676591931e-05} +{"ts": "2025-12-28T03:37:14", "event": "train_log", "step": 7542, "epoch": 3.1822784810126583, "progress_pct": 53.04, "epoch_pct": 53.04, "eta": "29:38:17", "max_grad_norm": 0.8, "loss": 0.50123530626297, "grad_norm": 1.2892001867294312, "learning_rate": 5.1656705523505986e-05} +{"ts": "2025-12-28T03:37:27", "event": "train_log", "step": 7544, "epoch": 3.1831223628691983, "progress_pct": 53.05, "epoch_pct": 53.05, "eta": "29:37:28", "max_grad_norm": 0.8, "loss": 0.5273304581642151, "grad_norm": 1.157297134399414, "learning_rate": 5.1632703898922544e-05} +{"ts": "2025-12-28T03:37:40", "event": "train_log", "step": 7546, "epoch": 3.183966244725738, "progress_pct": 53.07, "epoch_pct": 53.07, "eta": "29:36:39", "max_grad_norm": 0.8, "loss": 0.6163570880889893, "grad_norm": 1.5902838706970215, "learning_rate": 5.1608701897705645e-05} +{"ts": "2025-12-28T03:37:53", "event": "train_log", "step": 7548, "epoch": 3.1848101265822786, "progress_pct": 53.08, "epoch_pct": 53.08, "eta": "29:35:50", "max_grad_norm": 0.8, "loss": 0.49931052327156067, "grad_norm": 1.2332760095596313, "learning_rate": 5.158469952539213e-05} +{"ts": "2025-12-28T03:38:05", "event": "train_log", "step": 7550, "epoch": 3.1856540084388185, "progress_pct": 53.09, "epoch_pct": 53.09, "eta": "29:35:01", "max_grad_norm": 0.8, "loss": 0.5390233993530273, "grad_norm": 1.4857407808303833, "learning_rate": 5.156069678751887e-05} +{"ts": "2025-12-28T03:38:18", "event": "train_log", "step": 7552, "epoch": 3.1864978902953585, "progress_pct": 53.11, "epoch_pct": 53.11, "eta": "29:34:12", "max_grad_norm": 0.8, "loss": 0.5518596768379211, "grad_norm": 1.2669347524642944, "learning_rate": 5.1536693689622816e-05} +{"ts": "2025-12-28T03:38:29", "event": "train_log", "step": 7554, "epoch": 3.187341772151899, "progress_pct": 53.12, "epoch_pct": 53.12, "eta": "29:33:22", "max_grad_norm": 0.8, "loss": 0.5428967475891113, "grad_norm": 1.5172291994094849, "learning_rate": 5.151269023724108e-05} +{"ts": "2025-12-28T03:38:42", "event": "train_log", "step": 7556, "epoch": 3.188185654008439, "progress_pct": 53.14, "epoch_pct": 53.14, "eta": "29:32:33", "max_grad_norm": 0.8, "loss": 0.5223005414009094, "grad_norm": 1.3122823238372803, "learning_rate": 5.1488686435910716e-05} +{"ts": "2025-12-28T03:38:54", "event": "train_log", "step": 7558, "epoch": 3.1890295358649787, "progress_pct": 53.15, "epoch_pct": 53.15, "eta": "29:31:44", "max_grad_norm": 0.8, "loss": 0.5134532451629639, "grad_norm": 1.3801649808883667, "learning_rate": 5.146468229116901e-05} +{"ts": "2025-12-28T03:39:07", "event": "train_log", "step": 7560, "epoch": 3.189873417721519, "progress_pct": 53.16, "epoch_pct": 53.16, "eta": "29:30:55", "max_grad_norm": 0.8, "loss": 0.48221880197525024, "grad_norm": 1.19368314743042, "learning_rate": 5.144067780855322e-05} +{"ts": "2025-12-28T03:39:20", "event": "train_log", "step": 7562, "epoch": 3.190717299578059, "progress_pct": 53.18, "epoch_pct": 53.18, "eta": "29:30:07", "max_grad_norm": 0.8, "loss": 0.500047504901886, "grad_norm": 1.3684933185577393, "learning_rate": 5.141667299360073e-05} +{"ts": "2025-12-28T03:39:34", "event": "train_log", "step": 7564, "epoch": 3.191561181434599, "progress_pct": 53.19, "epoch_pct": 53.19, "eta": "29:29:19", "max_grad_norm": 0.8, "loss": 0.48337340354919434, "grad_norm": 1.3344818353652954, "learning_rate": 5.1392667851848977e-05} +{"ts": "2025-12-28T03:39:46", "event": "train_log", "step": 7566, "epoch": 3.1924050632911394, "progress_pct": 53.21, "epoch_pct": 53.21, "eta": "29:28:29", "max_grad_norm": 0.8, "loss": 0.5282860398292542, "grad_norm": 1.2710987329483032, "learning_rate": 5.136866238883551e-05} +{"ts": "2025-12-28T03:39:59", "event": "train_log", "step": 7568, "epoch": 3.1932489451476793, "progress_pct": 53.22, "epoch_pct": 53.22, "eta": "29:27:41", "max_grad_norm": 0.8, "loss": 0.5580173134803772, "grad_norm": 1.228177785873413, "learning_rate": 5.134465661009792e-05} +{"ts": "2025-12-28T03:40:11", "event": "train_log", "step": 7570, "epoch": 3.1940928270042193, "progress_pct": 53.23, "epoch_pct": 53.23, "eta": "29:26:52", "max_grad_norm": 0.8, "loss": 0.5567790865898132, "grad_norm": 1.4681973457336426, "learning_rate": 5.132065052117385e-05} +{"ts": "2025-12-28T03:40:24", "event": "train_log", "step": 7572, "epoch": 3.1949367088607596, "progress_pct": 53.25, "epoch_pct": 53.25, "eta": "29:26:03", "max_grad_norm": 0.8, "loss": 0.5302338004112244, "grad_norm": 1.3475103378295898, "learning_rate": 5.129664412760109e-05} +{"ts": "2025-12-28T03:40:37", "event": "train_log", "step": 7574, "epoch": 3.1957805907172996, "progress_pct": 53.26, "epoch_pct": 53.26, "eta": "29:25:15", "max_grad_norm": 0.8, "loss": 0.4927324950695038, "grad_norm": 1.1450512409210205, "learning_rate": 5.1272637434917424e-05} +{"ts": "2025-12-28T03:40:50", "event": "train_log", "step": 7576, "epoch": 3.1966244725738395, "progress_pct": 53.28, "epoch_pct": 53.28, "eta": "29:24:26", "max_grad_norm": 0.8, "loss": 0.6003677248954773, "grad_norm": 1.247843623161316, "learning_rate": 5.124863044866075e-05} +{"ts": "2025-12-28T03:41:02", "event": "train_log", "step": 7578, "epoch": 3.19746835443038, "progress_pct": 53.29, "epoch_pct": 53.29, "eta": "29:23:37", "max_grad_norm": 0.8, "loss": 0.5316550731658936, "grad_norm": 1.5563061237335205, "learning_rate": 5.1224623174369004e-05} +{"ts": "2025-12-28T03:41:14", "event": "train_log", "step": 7580, "epoch": 3.19831223628692, "progress_pct": 53.31, "epoch_pct": 53.31, "eta": "29:22:48", "max_grad_norm": 0.8, "loss": 0.5495386719703674, "grad_norm": 2.6963047981262207, "learning_rate": 5.120061561758022e-05} +{"ts": "2025-12-28T03:41:27", "event": "train_log", "step": 7582, "epoch": 3.19915611814346, "progress_pct": 53.32, "epoch_pct": 53.32, "eta": "29:22:00", "max_grad_norm": 0.8, "loss": 0.5243270993232727, "grad_norm": 1.4850870370864868, "learning_rate": 5.1176607783832465e-05} +{"ts": "2025-12-28T03:41:39", "event": "train_log", "step": 7584, "epoch": 3.2, "progress_pct": 53.33, "epoch_pct": 53.33, "eta": "29:21:11", "max_grad_norm": 0.8, "loss": 0.5345951914787292, "grad_norm": 1.408374309539795, "learning_rate": 5.115259967866389e-05} +{"ts": "2025-12-28T03:41:52", "event": "train_log", "step": 7586, "epoch": 3.20084388185654, "progress_pct": 53.35, "epoch_pct": 53.35, "eta": "29:20:22", "max_grad_norm": 0.8, "loss": 0.5151079893112183, "grad_norm": 1.1883639097213745, "learning_rate": 5.1128591307612706e-05} +{"ts": "2025-12-28T03:42:04", "event": "train_log", "step": 7588, "epoch": 3.20168776371308, "progress_pct": 53.36, "epoch_pct": 53.36, "eta": "29:19:33", "max_grad_norm": 0.8, "loss": 0.5611642003059387, "grad_norm": 1.5572841167449951, "learning_rate": 5.110458267621718e-05} +{"ts": "2025-12-28T03:42:16", "event": "train_log", "step": 7590, "epoch": 3.2025316455696204, "progress_pct": 53.38, "epoch_pct": 53.38, "eta": "29:18:44", "max_grad_norm": 0.8, "loss": 0.5970073342323303, "grad_norm": 1.3882172107696533, "learning_rate": 5.108057379001566e-05} +{"ts": "2025-12-28T03:42:29", "event": "train_log", "step": 7592, "epoch": 3.2033755274261604, "progress_pct": 53.39, "epoch_pct": 53.39, "eta": "29:17:55", "max_grad_norm": 0.8, "loss": 0.533338725566864, "grad_norm": 1.2527399063110352, "learning_rate": 5.10565646545465e-05} +{"ts": "2025-12-28T03:42:41", "event": "train_log", "step": 7594, "epoch": 3.2042194092827003, "progress_pct": 53.4, "epoch_pct": 53.4, "eta": "29:17:06", "max_grad_norm": 0.8, "loss": 0.543342649936676, "grad_norm": 1.3665586709976196, "learning_rate": 5.103255527534817e-05} +{"ts": "2025-12-28T03:42:53", "event": "train_log", "step": 7596, "epoch": 3.2050632911392407, "progress_pct": 53.42, "epoch_pct": 53.42, "eta": "29:16:17", "max_grad_norm": 0.8, "loss": 0.5761571526527405, "grad_norm": 1.2960436344146729, "learning_rate": 5.100854565795918e-05} +{"ts": "2025-12-28T03:43:06", "event": "train_log", "step": 7598, "epoch": 3.2059071729957807, "progress_pct": 53.43, "epoch_pct": 53.43, "eta": "29:15:29", "max_grad_norm": 0.8, "loss": 0.5874620676040649, "grad_norm": 1.545060157775879, "learning_rate": 5.0984535807918065e-05} +{"ts": "2025-12-28T03:43:18", "event": "train_log", "step": 7600, "epoch": 3.2067510548523206, "progress_pct": 53.45, "epoch_pct": 53.45, "eta": "29:14:39", "max_grad_norm": 0.8, "loss": 0.5559571981430054, "grad_norm": 1.2778210639953613, "learning_rate": 5.0960525730763455e-05} +{"ts": "2025-12-28T03:51:52", "event": "train_log", "step": 7600, "epoch": 3.2067510548523206, "progress_pct": 53.45, "epoch_pct": 53.45, "eta": "29:22:07", "max_grad_norm": 0.8, "eval_loss": 0.6755207777023315, "eval_runtime": 513.9779, "eval_samples_per_second": 4.099, "eval_steps_per_second": 4.099} +{"ts": "2025-12-28T03:51:52", "event": "train_log", "step": 7600, "epoch": 3.2067510548523206, "progress_pct": 53.45, "epoch_pct": 53.45, "eta": "29:22:07", "max_grad_norm": 0.8, "train_runtime": 121379.3179, "train_samples_per_second": 0.937, "train_steps_per_second": 0.117, "total_flos": 7.892056292508187e+18, "train_loss": 0.6813117427967097} +{"ts": "2025-12-28T04:00:24", "event": "train_log", "step": 7600, "epoch": 3.2067510548523206, "progress_pct": 53.45, "epoch_pct": 53.45, "eta": "29:29:33", "max_grad_norm": 0.8, "eval_loss": 0.6706293225288391, "eval_runtime": 511.6513, "eval_samples_per_second": 4.118, "eval_steps_per_second": 4.118} diff --git a/sft_devstral_24B_v2/wandb/debug-internal.log b/sft_devstral_24B_v2/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0f91037f83a83206ed5992bb506a61dc164da769 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-26T18:08:08.66103332Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-26T18:08:08.82172381Z","level":"INFO","msg":"stream: created new stream","id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.821819478Z","level":"INFO","msg":"handler: started","stream_id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.822049155Z","level":"INFO","msg":"stream: started","id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.822072296Z","level":"INFO","msg":"writer: started","stream_id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.822098276Z","level":"INFO","msg":"sender: started","stream_id":"ny9q48hd"} +{"time":"2025-12-28T04:02:04.935383596Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-28T04:02:05.045953421Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-28T04:02:05.051806259Z","level":"INFO","msg":"stream: closing","id":"ny9q48hd"} +{"time":"2025-12-28T04:02:05.051833004Z","level":"INFO","msg":"handler: closed","stream_id":"ny9q48hd"} +{"time":"2025-12-28T04:02:05.051917075Z","level":"INFO","msg":"sender: closed","stream_id":"ny9q48hd"} +{"time":"2025-12-28T04:02:05.051937152Z","level":"INFO","msg":"stream: closed","id":"ny9q48hd"} diff --git a/sft_devstral_24B_v2/wandb/debug.log b/sft_devstral_24B_v2/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0d4749de3aa5f0726c27004e2dfdcc433243200c --- /dev/null +++ b/sft_devstral_24B_v2/wandb/debug.log @@ -0,0 +1,29 @@ +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Configure stats pid to 190322 +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug.log +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-internal.log +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():841] calling init triggers +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': './Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with \n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '1e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 0.8, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_devstral_24B_v2', '_wandb': {}} +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():889] starting backend +2025-12-26 18:08:08,653 INFO MainThread:190322 [wandb_init.py:init():892] sending inform_init request +2025-12-26 18:08:08,658 INFO MainThread:190322 [wandb_init.py:init():900] backend started and connected +2025-12-26 18:08:08,661 INFO MainThread:190322 [wandb_init.py:init():970] updated telemetry +2025-12-26 18:08:08,662 INFO MainThread:190322 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-26 18:08:09,021 INFO MainThread:190322 [wandb_init.py:init():1041] starting run threads in backend +2025-12-26 18:08:09,134 INFO MainThread:190322 [wandb_run.py:_console_start():2521] atexit reg +2025-12-26 18:08:09,134 INFO MainThread:190322 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-26 18:08:09,135 INFO MainThread:190322 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-26 18:08:09,135 INFO MainThread:190322 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-26 18:08:09,138 INFO MainThread:190322 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-26 18:08:52,955 INFO MainThread:190322 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': 'Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'inference_mode': False, 'r': 8, 'target_modules': ['v_proj', 'q_proj', 'o_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 16, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'image_token_index': 10, 'projector_hidden_act': 'gelu', 'vision_feature_layer': -1, 'vision_config': {'hidden_size': 1024, 'intermediate_size': 4096, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'num_channels': 3, 'patch_size': 14, 'image_size': 1540, 'attention_dropout': 0.0, 'hidden_act': 'silu', 'head_dim': 64, 'initializer_range': 0.02, 'rope_parameters': {'rope_theta': 10000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '', 'model_type': 'pixtral', 'output_attentions': False}, 'text_config': {'vocab_size': 131072, 'max_position_embeddings': 393216, 'hidden_size': 5120, 'intermediate_size': 32768, 'num_hidden_layers': 40, 'num_attention_heads': 32, 'sliding_window': None, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': True, 'attention_dropout': 0.0, 'rope_parameters': {'beta_fast': 32.0, 'beta_slow': 1.0, 'factor': 48.0, 'llama_4_scaling_beta': 0.1, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'original_max_position_embeddings': 8192, 'rope_theta': 100000000.0, 'rope_type': 'yarn', 'type': 'yarn'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': 11, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '', 'model_type': 'ministral3', 'output_attentions': False}, 'multimodal_projector_bias': False, 'spatial_merge_size': 2, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Mistral3ForConditionalGeneration'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': 'Models/Devstral-Small-2-24B-HS-CPT', 'transformers_version': '5.0.0.dev0', 'model_type': 'mistral3', 'use_cache': False, 'output_attentions': False, 'output_dir': 'task2file/sft_devstral_24B_v2/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 0.8, 'num_train_epochs': 6.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.08, 'warmup_steps': 0.08, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True} +2025-12-26 18:08:52,965 INFO MainThread:190322 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 24022764544 - > +2025-12-26 18:08:52,965 INFO MainThread:190322 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 24022764544 None +2025-12-28 04:02:04,643 INFO MainThread:190322 [wandb_run.py:_finish():2287] finishing run sirajuddin-shaik-007/sft-training/ny9q48hd +2025-12-28 04:02:04,645 INFO MainThread:190322 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0 +2025-12-28 04:02:04,646 INFO MainThread:190322 [wandb_run.py:_restore():2468] restore +2025-12-28 04:02:04,646 INFO MainThread:190322 [wandb_run.py:_restore():2474] restore done +2025-12-28 04:02:05,050 INFO MainThread:190322 [wandb_run.py:_footer_sync_info():3862] logging synced files diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/config.yaml b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6bb47eb113490d5f13a3a858ce46b261615fb47f --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/config.yaml @@ -0,0 +1,173 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + d58ijptqwmgcs1za2j0g4cusbym7bptn: + args: + - --config + - trainer-kit/SFT/config_instruct.yaml + codePath: trainer-kit/SFT/run_instruct.py + codePathLocal: trainer-kit/SFT/run_instruct.py + cpu_count: 12 + cpu_count_logical: 24 + cudaVersion: "13.0" + disk: + /: + total: "791251738624" + used: "385798254592" + email: shaiksirajuddin9949@gmail.com + executable: /workspace/llm_finetuning_env/bin/python + gpu: NVIDIA A100-SXM4-80GB + gpu_count: 2 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40 + host: a100-2gpu-shell-session-757d587799-mfdvv + memory: + total: "359047892992" + os: Linux-6.12.46+-x86_64-with-glibc2.35 + program: /workspace/trainer-kit/SFT/run_instruct.py + python: CPython 3.10.12 + root: task2file/sft_devstral_24B_v2 + startedAt: "2025-12-26T18:05:57.725585Z" + writerId: d58ijptqwmgcs1za2j0g4cusbym7bptn + m: [] + python_version: 3.10.12 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "3": + - 15 + - 16 + "4": 3.10.12 + "5": 0.23.1 + "6": 5.0.0.dev0 + "12": 0.23.1 + "13": linux-x86_64 +data: + value: + custom_template: |- + ##INSTRUCTION + {instruction}<|im_end|> + {input}<|im_end|> + {output}<|im_end|> + eval_jsonl: null + eval_split_ratio: 0.1 + format_type: custom + input_field: input + instruction_field: instruction + max_length: 2048 + num_proc: 4 + output_field: output + shuffle: true + system_prompt: | + You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task. + + ## Output Format + + ##OUTPUT + Explain the data flow and why each component must change: + - Flow: [Input → Processing → Output with arrows] + - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]" + - Explain coupling between components + + ##SELECT + modify::crates/path/to/file.rs::impl::ComponentName + add::crates/another/file.rs::function::AnotherComponent + + + ## Rules + + 1. Use full paths: `remove::crates/folder/file.rs::Type::Name` + 2. Use `::` for nested items: `status::StructName::Type::Name` + 3. Always explain "must change because" and "without this" + 3. Types of components: function, struct, enum, impl, trait + 4. If there is extra information (e.g., enum variants), include that too. + 5. Start with ##OUTPUT, end with ##SELECT, terminate with + + ## Example + + ##TASK + Add webhook subscription support + + ##OUTPUT + The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings. + + ##SELECT + crates/common_enums/src/enums.rs::EventClass + crates/common_enums/src/transformers.rs::SubscriptionStatus + + train_jsonl: ./sft_dataset.jsonl +model: + value: + attn_implementation: null + base_local_dir: base_model + bnb_4bit_compute_dtype: bfloat16 + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + device_map: auto + repo_id: ./Models/Devstral-Small-2-24B-HS-CPT + revision: null + tokenizer_use_fast: true + torch_dtype: bfloat16 + trust_remote_code: true + use_4bit: false +peft: + value: + bias: none + enabled: true + lora_alpha: 16 + lora_dropout: 0.05 + r: 8 + target_modules: auto +run_dir: + value: task2file/sft_devstral_24B_v2 +train: + value: + early_stopping: + enabled: true + metric: eval_loss + min_delta: 0.001 + mode: min + patience: 5 + eval_steps: 100 + evaluation_strategy: steps + gradient_accumulation_steps: 8 + gradient_checkpointing: true + learning_rate: "1e-4" + load_best_model_at_end: true + logging_steps: 2 + lr_scheduler_type: cosine + max_grad_norm: 0.8 + num_train_epochs: 6 + optim: adamw_torch + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + resume_from_checkpoint: auto + save_steps: 500 + save_strategy: steps + save_total_limit: 20 + warmup_ratio: 0.08 + weight_decay: 0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/output.log b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0049a8ece883c35fcd8acb7bc6c56f0d7e2d9a96 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/output.log @@ -0,0 +1,34 @@ +Wandb initialized: project='sft-training', name='auto-generated' +[info] Detected Mistral3 model architecture, loading with specific class +Traceback (most recent call last): + File "/workspace/trainer-kit/SFT/run_instruct.py", line 983, in + main() + File "/workspace/trainer-kit/SFT/run_instruct.py", line 849, in main + model, tokenizer = load_base_model_and_tokenizer(cfg, base_dir) + File "/workspace/trainer-kit/SFT/run_instruct.py", line 579, in load_base_model_and_tokenizer + model = Mistral3ForConditionalGeneration.from_pretrained( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3948, in from_pretrained + device_map = _get_device_map(model, device_map, max_memory, hf_quantizer) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/integrations/accelerate.py", line 281, in _get_device_map + inferred_max_memory = get_balanced_memory( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/integrations/accelerate.py", line 197, in get_balanced_memory + max_memory = get_max_memory(max_memory) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/modeling.py", line 804, in get_max_memory + _ = torch.tensor([0], device=i) +KeyboardInterrupt +Traceback (most recent call last): + File "/workspace/trainer-kit/SFT/run_instruct.py", line 983, in + main() + File "/workspace/trainer-kit/SFT/run_instruct.py", line 849, in main + model, tokenizer = load_base_model_and_tokenizer(cfg, base_dir) + File "/workspace/trainer-kit/SFT/run_instruct.py", line 579, in load_base_model_and_tokenizer + model = Mistral3ForConditionalGeneration.from_pretrained( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3948, in from_pretrained + device_map = _get_device_map(model, device_map, max_memory, hf_quantizer) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/integrations/accelerate.py", line 281, in _get_device_map + inferred_max_memory = get_balanced_memory( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/integrations/accelerate.py", line 197, in get_balanced_memory + max_memory = get_max_memory(max_memory) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/modeling.py", line 804, in get_max_memory + _ = torch.tensor([0], device=i) +KeyboardInterrupt diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/requirements.txt b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a4241d8724f018c9bdfcd7c289f1f14578574b --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/requirements.txt @@ -0,0 +1,104 @@ +exceptiongroup==1.3.1 +wheel==0.45.1 +python-dateutil==2.9.0.post0 +nvidia-ml-py==13.580.82 +huggingface_hub==1.2.3 +idna==3.11 +click==8.3.1 +numpy==2.2.6 +httpx==0.28.1 +tokenizers==0.22.1 +sympy==1.13.1 +yarl==1.22.0 +async-timeout==5.0.1 +datasets==4.4.2 +platformdirs==4.5.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-nvtx-cu12==12.1.105 +smmap==5.0.2 +accelerate==1.12.0 +requests==2.32.5 +aiohttp==3.13.2 +bitsandbytes==0.49.0 +nvidia-cublas-cu12==12.1.3.1 +mpmath==1.3.0 +torchaudio==2.5.1+cu121 +nvidia-cuda-runtime-cu12==12.1.105 +typing-inspection==0.4.2 +GitPython==3.1.45 +xxhash==3.6.0 +nvidia-cusolver-cu12==11.4.5.107 +pydantic_core==2.41.5 +six==1.17.0 +torchvision==0.20.1+cu121 +typing_extensions==4.15.0 +triton==3.1.0 +charset-normalizer==3.4.4 +nvitop==1.6.1 +wandb==0.23.1 +regex==2025.11.3 +pip==25.3 +nvidia-cusparse-cu12==12.1.0.106 +pytz==2025.2 +Jinja2==3.1.6 +psutil==7.2.0 +pillow==12.0.0 +packaging==25.0 +safetensors==0.7.0 +sentry-sdk==2.48.0 +gitdb==4.0.12 +httpcore==1.0.9 +setuptools==80.9.0 +nvidia-cufft-cu12==11.0.2.54 +anyio==4.12.0 +transformers==5.0.0.dev0 +pydantic==2.12.5 +fsspec==2025.10.0 +filelock==3.20.0 +PyYAML==6.0.3 +hf-xet==1.2.0 +nvidia-cudnn-cu12==9.1.0.70 +tqdm==4.67.1 +MarkupSafe==2.1.5 +attrs==25.4.0 +nvidia-cuda-nvrtc-cu12==12.1.105 +peft==0.18.0 +aiohappyeyeballs==2.6.1 +networkx==3.4.2 +nvidia-nvjitlink-cu12==12.9.86 +certifi==2025.11.12 +pyarrow==22.0.0 +dill==0.4.0 +protobuf==6.33.2 +aiosignal==1.4.0 +frozenlist==1.8.0 +urllib3==2.6.2 +propcache==0.4.1 +tzdata==2025.3 +pandas==2.3.3 +annotated-types==0.7.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.21.5 +multidict==6.7.0 +nvidia-curand-cu12==10.3.2.106 +trl==0.26.2 +torch==2.5.1+cu121 +h11==0.16.0 +multiprocess==0.70.18 +typer-slim==0.21.0 +wheel==0.45.1 +tomli==2.0.1 +autocommand==2.2.2 +jaraco.context==5.3.0 +zipp==3.19.2 +packaging==24.2 +inflect==7.3.1 +typing_extensions==4.12.2 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +jaraco.text==3.12.1 +backports.tarfile==1.2.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +typeguard==4.3.0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/wandb-metadata.json b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9e2c3c122325a07fd25c228d9e6f6bb84b20b9de --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/wandb-metadata.json @@ -0,0 +1,47 @@ +{ + "os": "Linux-6.12.46+-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-12-26T18:05:57.725585Z", + "args": [ + "--config", + "trainer-kit/SFT/config_instruct.yaml" + ], + "program": "/workspace/trainer-kit/SFT/run_instruct.py", + "codePath": "trainer-kit/SFT/run_instruct.py", + "codePathLocal": "trainer-kit/SFT/run_instruct.py", + "email": "shaiksirajuddin9949@gmail.com", + "root": "task2file/sft_devstral_24B_v2", + "host": "a100-2gpu-shell-session-757d587799-mfdvv", + "executable": "/workspace/llm_finetuning_env/bin/python", + "cpu_count": 12, + "cpu_count_logical": 24, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 2, + "disk": { + "/": { + "total": "791251738624", + "used": "385798254592" + } + }, + "memory": { + "total": "359047892992" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40" + } + ], + "cudaVersion": "13.0", + "writerId": "d58ijptqwmgcs1za2j0g4cusbym7bptn" +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/wandb-summary.json b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8afb95f49483c85658a334253ad61c5e4b5851ef --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2},"_runtime":2} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-core.log b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..29c63cfbb9d134b0fac81e6e76ee6b7dceb11bd5 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-26T18:05:57.823957218Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpht47h6p7/port-189168.txt","pid":189168,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-26T18:05:57.824696455Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":189168} +{"time":"2025-12-26T18:05:57.82469386Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-189168-189238-3343268377/socket","Net":"unix"}} +{"time":"2025-12-26T18:05:58.003542516Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-26T18:05:58.010768581Z","level":"INFO","msg":"handleInformInit: received","streamId":"p7wwl5ek","id":"1(@)"} +{"time":"2025-12-26T18:05:58.174029705Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"p7wwl5ek","id":"1(@)"} +{"time":"2025-12-26T18:06:01.288089083Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-26T18:06:01.288159025Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-26T18:06:01.288200541Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-26T18:06:01.288263567Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-26T18:06:01.288324658Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-189168-189238-3343268377/socket","Net":"unix"}} +{"time":"2025-12-26T18:06:01.731707539Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-26T18:06:01.731736159Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-26T18:06:01.731744711Z","level":"INFO","msg":"server is closed"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-internal.log b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9564181b6c26ffe3ddea7506e1320cb20d45a13d --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-26T18:05:58.010910802Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-26T18:05:58.173852132Z","level":"INFO","msg":"stream: created new stream","id":"p7wwl5ek"} +{"time":"2025-12-26T18:05:58.173936115Z","level":"INFO","msg":"handler: started","stream_id":"p7wwl5ek"} +{"time":"2025-12-26T18:05:58.174019933Z","level":"INFO","msg":"stream: started","id":"p7wwl5ek"} +{"time":"2025-12-26T18:05:58.174038448Z","level":"INFO","msg":"writer: started","stream_id":"p7wwl5ek"} +{"time":"2025-12-26T18:05:58.174048363Z","level":"INFO","msg":"sender: started","stream_id":"p7wwl5ek"} +{"time":"2025-12-26T18:06:01.288165843Z","level":"INFO","msg":"stream: closing","id":"p7wwl5ek"} +{"time":"2025-12-26T18:06:01.633870412Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-26T18:06:01.730892428Z","level":"INFO","msg":"handler: closed","stream_id":"p7wwl5ek"} +{"time":"2025-12-26T18:06:01.730977697Z","level":"INFO","msg":"sender: closed","stream_id":"p7wwl5ek"} +{"time":"2025-12-26T18:06:01.730985259Z","level":"INFO","msg":"stream: closed","id":"p7wwl5ek"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug.log b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d7a226097af37dc1428dae509960b04b5d9eda66 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug.log @@ -0,0 +1,23 @@ +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_setup.py:_flush():80] Configure stats pid to 189168 +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug.log +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/logs/debug-internal.log +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_init.py:init():841] calling init triggers +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': './Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': './sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with \n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '1e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 0.8, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_devstral_24B_v2', '_wandb': {}} +2025-12-26 18:05:57,727 INFO MainThread:189168 [wandb_init.py:init():889] starting backend +2025-12-26 18:05:58,003 INFO MainThread:189168 [wandb_init.py:init():892] sending inform_init request +2025-12-26 18:05:58,008 INFO MainThread:189168 [wandb_init.py:init():900] backend started and connected +2025-12-26 18:05:58,010 INFO MainThread:189168 [wandb_init.py:init():970] updated telemetry +2025-12-26 18:05:58,011 INFO MainThread:189168 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-26 18:05:58,366 INFO MainThread:189168 [wandb_init.py:init():1041] starting run threads in backend +2025-12-26 18:05:58,481 INFO MainThread:189168 [wandb_run.py:_console_start():2521] atexit reg +2025-12-26 18:05:58,481 INFO MainThread:189168 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-26 18:05:58,481 INFO MainThread:189168 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-26 18:05:58,481 INFO MainThread:189168 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-26 18:05:58,485 INFO MainThread:189168 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-26 18:06:01,288 INFO wandb-AsyncioManager-main:189168 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-26 18:06:01,288 INFO wandb-AsyncioManager-main:189168 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/run-p7wwl5ek.wandb b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/run-p7wwl5ek.wandb new file mode 100644 index 0000000000000000000000000000000000000000..720576eb9d6b134396929fc5fce4b17be5fdcb7c Binary files /dev/null and b/sft_devstral_24B_v2/wandb/run-20251226_180557-p7wwl5ek/run-p7wwl5ek.wandb differ diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/config.yaml b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a28a566190467228cbb40451bd46b295b52d8f7f --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/config.yaml @@ -0,0 +1,173 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + qgjoaibnrh2irresqyfv8dka3f0628ti: + args: + - --config + - trainer-kit/SFT/config_instruct.yaml + codePath: trainer-kit/SFT/run_instruct.py + codePathLocal: trainer-kit/SFT/run_instruct.py + cpu_count: 12 + cpu_count_logical: 24 + cudaVersion: "13.0" + disk: + /: + total: "791251738624" + used: "386025496576" + email: shaiksirajuddin9949@gmail.com + executable: /workspace/llm_finetuning_env/bin/python + gpu: NVIDIA A100-SXM4-80GB + gpu_count: 2 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40 + host: a100-2gpu-shell-session-757d587799-mfdvv + memory: + total: "359047892992" + os: Linux-6.12.46+-x86_64-with-glibc2.35 + program: /workspace/trainer-kit/SFT/run_instruct.py + python: CPython 3.10.12 + root: task2file/sft_devstral_24B_v2 + startedAt: "2025-12-26T18:06:13.427654Z" + writerId: qgjoaibnrh2irresqyfv8dka3f0628ti + m: [] + python_version: 3.10.12 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "3": + - 15 + - 16 + "4": 3.10.12 + "5": 0.23.1 + "6": 5.0.0.dev0 + "12": 0.23.1 + "13": linux-x86_64 +data: + value: + custom_template: |- + ##INSTRUCTION + {instruction}<|im_end|> + {input}<|im_end|> + {output}<|im_end|> + eval_jsonl: null + eval_split_ratio: 0.1 + format_type: custom + input_field: input + instruction_field: instruction + max_length: 2048 + num_proc: 4 + output_field: output + shuffle: true + system_prompt: | + You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task. + + ## Output Format + + ##OUTPUT + Explain the data flow and why each component must change: + - Flow: [Input → Processing → Output with arrows] + - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]" + - Explain coupling between components + + ##SELECT + modify::crates/path/to/file.rs::impl::ComponentName + add::crates/another/file.rs::function::AnotherComponent + + + ## Rules + + 1. Use full paths: `remove::crates/folder/file.rs::Type::Name` + 2. Use `::` for nested items: `status::StructName::Type::Name` + 3. Always explain "must change because" and "without this" + 3. Types of components: function, struct, enum, impl, trait + 4. If there is extra information (e.g., enum variants), include that too. + 5. Start with ##OUTPUT, end with ##SELECT, terminate with + + ## Example + + ##TASK + Add webhook subscription support + + ##OUTPUT + The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings. + + ##SELECT + crates/common_enums/src/enums.rs::EventClass + crates/common_enums/src/transformers.rs::SubscriptionStatus + + train_jsonl: ./sft_dataset.jsonl +model: + value: + attn_implementation: null + base_local_dir: base_model + bnb_4bit_compute_dtype: bfloat16 + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + device_map: auto + repo_id: ./Models/Devstral-Small-2-24B-HS-CPT + revision: null + tokenizer_use_fast: true + torch_dtype: bfloat16 + trust_remote_code: true + use_4bit: false +peft: + value: + bias: none + enabled: true + lora_alpha: 16 + lora_dropout: 0.05 + r: 8 + target_modules: auto +run_dir: + value: task2file/sft_devstral_24B_v2 +train: + value: + early_stopping: + enabled: true + metric: eval_loss + min_delta: 0.001 + mode: min + patience: 5 + eval_steps: 100 + evaluation_strategy: steps + gradient_accumulation_steps: 8 + gradient_checkpointing: true + learning_rate: "1e-4" + load_best_model_at_end: true + logging_steps: 2 + lr_scheduler_type: cosine + max_grad_norm: 0.8 + num_train_epochs: 6 + optim: adamw_torch + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + resume_from_checkpoint: auto + save_steps: 500 + save_strategy: steps + save_total_limit: 20 + warmup_ratio: 0.08 + weight_decay: 0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/output.log b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0f6df7669f98274cdbfeaf638174ba4765ae106e --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/output.log @@ -0,0 +1,48 @@ +Wandb initialized: project='sft-training', name='auto-generated' +[info] Detected Mistral3 model architecture, loading with specific class +Loading weights: 100%|█| 585/585 [00:12<00:00, 46.92it/s, Materializing param=model.vision_tower.transfor +[info] Ensuring all parameters are materialized... +Traceback (most recent call last): + File "/workspace/trainer-kit/SFT/run_instruct.py", line 983, in + main() + File "/workspace/trainer-kit/SFT/run_instruct.py", line 852, in main + train_ds, eval_ds = build_datasets(cfg, tokenizer) + File "/workspace/trainer-kit/SFT/run_instruct.py", line 414, in build_datasets + ds = load_dataset("json", data_files={"train": train_path}) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1492, in load_dataset + builder_instance = load_dataset_builder( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1137, in load_dataset_builder + dataset_module = dataset_module_factory( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 913, in dataset_module_factory + ).get_module() + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 527, in get_module + data_files = DataFilesDict.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 708, in from_patterns + else DataFilesList.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 601, in from_patterns + resolve_pattern( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 390, in resolve_pattern + raise FileNotFoundError(error_msg) +FileNotFoundError: Unable to find '/workspace/./sft_dataset.jsonl' +Traceback (most recent call last): + File "/workspace/trainer-kit/SFT/run_instruct.py", line 983, in + main() + File "/workspace/trainer-kit/SFT/run_instruct.py", line 852, in main + train_ds, eval_ds = build_datasets(cfg, tokenizer) + File "/workspace/trainer-kit/SFT/run_instruct.py", line 414, in build_datasets + ds = load_dataset("json", data_files={"train": train_path}) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1492, in load_dataset + builder_instance = load_dataset_builder( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1137, in load_dataset_builder + dataset_module = dataset_module_factory( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 913, in dataset_module_factory + ).get_module() + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 527, in get_module + data_files = DataFilesDict.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 708, in from_patterns + else DataFilesList.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 601, in from_patterns + resolve_pattern( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 390, in resolve_pattern + raise FileNotFoundError(error_msg) +FileNotFoundError: Unable to find '/workspace/./sft_dataset.jsonl' diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/requirements.txt b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a4241d8724f018c9bdfcd7c289f1f14578574b --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/requirements.txt @@ -0,0 +1,104 @@ +exceptiongroup==1.3.1 +wheel==0.45.1 +python-dateutil==2.9.0.post0 +nvidia-ml-py==13.580.82 +huggingface_hub==1.2.3 +idna==3.11 +click==8.3.1 +numpy==2.2.6 +httpx==0.28.1 +tokenizers==0.22.1 +sympy==1.13.1 +yarl==1.22.0 +async-timeout==5.0.1 +datasets==4.4.2 +platformdirs==4.5.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-nvtx-cu12==12.1.105 +smmap==5.0.2 +accelerate==1.12.0 +requests==2.32.5 +aiohttp==3.13.2 +bitsandbytes==0.49.0 +nvidia-cublas-cu12==12.1.3.1 +mpmath==1.3.0 +torchaudio==2.5.1+cu121 +nvidia-cuda-runtime-cu12==12.1.105 +typing-inspection==0.4.2 +GitPython==3.1.45 +xxhash==3.6.0 +nvidia-cusolver-cu12==11.4.5.107 +pydantic_core==2.41.5 +six==1.17.0 +torchvision==0.20.1+cu121 +typing_extensions==4.15.0 +triton==3.1.0 +charset-normalizer==3.4.4 +nvitop==1.6.1 +wandb==0.23.1 +regex==2025.11.3 +pip==25.3 +nvidia-cusparse-cu12==12.1.0.106 +pytz==2025.2 +Jinja2==3.1.6 +psutil==7.2.0 +pillow==12.0.0 +packaging==25.0 +safetensors==0.7.0 +sentry-sdk==2.48.0 +gitdb==4.0.12 +httpcore==1.0.9 +setuptools==80.9.0 +nvidia-cufft-cu12==11.0.2.54 +anyio==4.12.0 +transformers==5.0.0.dev0 +pydantic==2.12.5 +fsspec==2025.10.0 +filelock==3.20.0 +PyYAML==6.0.3 +hf-xet==1.2.0 +nvidia-cudnn-cu12==9.1.0.70 +tqdm==4.67.1 +MarkupSafe==2.1.5 +attrs==25.4.0 +nvidia-cuda-nvrtc-cu12==12.1.105 +peft==0.18.0 +aiohappyeyeballs==2.6.1 +networkx==3.4.2 +nvidia-nvjitlink-cu12==12.9.86 +certifi==2025.11.12 +pyarrow==22.0.0 +dill==0.4.0 +protobuf==6.33.2 +aiosignal==1.4.0 +frozenlist==1.8.0 +urllib3==2.6.2 +propcache==0.4.1 +tzdata==2025.3 +pandas==2.3.3 +annotated-types==0.7.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.21.5 +multidict==6.7.0 +nvidia-curand-cu12==10.3.2.106 +trl==0.26.2 +torch==2.5.1+cu121 +h11==0.16.0 +multiprocess==0.70.18 +typer-slim==0.21.0 +wheel==0.45.1 +tomli==2.0.1 +autocommand==2.2.2 +jaraco.context==5.3.0 +zipp==3.19.2 +packaging==24.2 +inflect==7.3.1 +typing_extensions==4.12.2 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +jaraco.text==3.12.1 +backports.tarfile==1.2.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +typeguard==4.3.0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/wandb-metadata.json b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..123dd2dae3546b21d00b55d1a4cebba0c0a84cb0 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/wandb-metadata.json @@ -0,0 +1,47 @@ +{ + "os": "Linux-6.12.46+-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-12-26T18:06:13.427654Z", + "args": [ + "--config", + "trainer-kit/SFT/config_instruct.yaml" + ], + "program": "/workspace/trainer-kit/SFT/run_instruct.py", + "codePath": "trainer-kit/SFT/run_instruct.py", + "codePathLocal": "trainer-kit/SFT/run_instruct.py", + "email": "shaiksirajuddin9949@gmail.com", + "root": "task2file/sft_devstral_24B_v2", + "host": "a100-2gpu-shell-session-757d587799-mfdvv", + "executable": "/workspace/llm_finetuning_env/bin/python", + "cpu_count": 12, + "cpu_count_logical": 24, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 2, + "disk": { + "/": { + "total": "791251738624", + "used": "386025496576" + } + }, + "memory": { + "total": "359047892992" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40" + } + ], + "cudaVersion": "13.0", + "writerId": "qgjoaibnrh2irresqyfv8dka3f0628ti" +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/wandb-summary.json b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..26eb9fe6450816c7c6ef2f464335d313076bb440 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":19},"_runtime":19} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-core.log b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..7b076a3990381f0bf76914939b2cf0e416a194a4 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-26T18:06:13.512689006Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpaql7q9l6/port-189374.txt","pid":189374,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-26T18:06:13.513337738Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":189374} +{"time":"2025-12-26T18:06:13.513339589Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-189374-189434-2072401818/socket","Net":"unix"}} +{"time":"2025-12-26T18:06:13.697155944Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-26T18:06:13.703520039Z","level":"INFO","msg":"handleInformInit: received","streamId":"i1cmzyri","id":"1(@)"} +{"time":"2025-12-26T18:06:13.861987964Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"i1cmzyri","id":"1(@)"} +{"time":"2025-12-26T18:06:33.265409361Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-26T18:06:33.265466167Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-26T18:06:33.265521907Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-26T18:06:33.265530544Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-26T18:06:33.265687334Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-189374-189434-2072401818/socket","Net":"unix"}} +{"time":"2025-12-26T18:06:33.573303185Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-26T18:06:33.573347477Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-26T18:06:33.573360632Z","level":"INFO","msg":"server is closed"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-internal.log b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0d0d55e2dd4cd933a4d616971fcf7353b7e983c2 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-26T18:06:13.703642155Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-26T18:06:13.861777831Z","level":"INFO","msg":"stream: created new stream","id":"i1cmzyri"} +{"time":"2025-12-26T18:06:13.861855775Z","level":"INFO","msg":"handler: started","stream_id":"i1cmzyri"} +{"time":"2025-12-26T18:06:13.861978087Z","level":"INFO","msg":"stream: started","id":"i1cmzyri"} +{"time":"2025-12-26T18:06:13.862005472Z","level":"INFO","msg":"writer: started","stream_id":"i1cmzyri"} +{"time":"2025-12-26T18:06:13.862018215Z","level":"INFO","msg":"sender: started","stream_id":"i1cmzyri"} +{"time":"2025-12-26T18:06:33.265479861Z","level":"INFO","msg":"stream: closing","id":"i1cmzyri"} +{"time":"2025-12-26T18:06:33.464312202Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-26T18:06:33.550123155Z","level":"INFO","msg":"handler: closed","stream_id":"i1cmzyri"} +{"time":"2025-12-26T18:06:33.550255641Z","level":"INFO","msg":"sender: closed","stream_id":"i1cmzyri"} +{"time":"2025-12-26T18:06:33.550271731Z","level":"INFO","msg":"stream: closed","id":"i1cmzyri"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug.log b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..695535f576be99cd31f351c88e2d182e4abcb663 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug.log @@ -0,0 +1,23 @@ +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_setup.py:_flush():80] Configure stats pid to 189374 +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug.log +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/logs/debug-internal.log +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_init.py:init():841] calling init triggers +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': './Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': './sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with \n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '1e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 0.8, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_devstral_24B_v2', '_wandb': {}} +2025-12-26 18:06:13,429 INFO MainThread:189374 [wandb_init.py:init():889] starting backend +2025-12-26 18:06:13,697 INFO MainThread:189374 [wandb_init.py:init():892] sending inform_init request +2025-12-26 18:06:13,701 INFO MainThread:189374 [wandb_init.py:init():900] backend started and connected +2025-12-26 18:06:13,703 INFO MainThread:189374 [wandb_init.py:init():970] updated telemetry +2025-12-26 18:06:13,704 INFO MainThread:189374 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-26 18:06:14,110 INFO MainThread:189374 [wandb_init.py:init():1041] starting run threads in backend +2025-12-26 18:06:14,225 INFO MainThread:189374 [wandb_run.py:_console_start():2521] atexit reg +2025-12-26 18:06:14,226 INFO MainThread:189374 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-26 18:06:14,226 INFO MainThread:189374 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-26 18:06:14,226 INFO MainThread:189374 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-26 18:06:14,230 INFO MainThread:189374 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-26 18:06:33,265 INFO wandb-AsyncioManager-main:189374 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-26 18:06:33,265 INFO wandb-AsyncioManager-main:189374 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/run-i1cmzyri.wandb b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/run-i1cmzyri.wandb new file mode 100644 index 0000000000000000000000000000000000000000..86b0d7bf8ae55aab7207ffabfbdc6df27c6431ed --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180613-i1cmzyri/run-i1cmzyri.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad100ab4b5ebb1346324439587c74624e607e5e7499a2b25173b835b4699789 +size 208577 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/config.yaml b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b162c53c0392f2d43d0b7a82876a2e9ae6f2735a --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/config.yaml @@ -0,0 +1,173 @@ +_wandb: + value: + cli_version: 0.23.1 + e: + b1hui6k7d05xwq8cyz4bv453ig8gyf1q: + args: + - --config + - trainer-kit/SFT/config_instruct.yaml + codePath: trainer-kit/SFT/run_instruct.py + codePathLocal: trainer-kit/SFT/run_instruct.py + cpu_count: 12 + cpu_count_logical: 24 + cudaVersion: "13.0" + disk: + /: + total: "791251738624" + used: "386726633472" + email: shaiksirajuddin9949@gmail.com + executable: /workspace/llm_finetuning_env/bin/python + gpu: NVIDIA A100-SXM4-80GB + gpu_count: 2 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40 + host: a100-2gpu-shell-session-757d587799-mfdvv + memory: + total: "359047892992" + os: Linux-6.12.46+-x86_64-with-glibc2.35 + program: /workspace/trainer-kit/SFT/run_instruct.py + python: CPython 3.10.12 + root: task2file/sft_devstral_24B_v2 + startedAt: "2025-12-26T18:07:02.184185Z" + writerId: b1hui6k7d05xwq8cyz4bv453ig8gyf1q + m: [] + python_version: 3.10.12 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "3": + - 15 + - 16 + "4": 3.10.12 + "5": 0.23.1 + "6": 5.0.0.dev0 + "12": 0.23.1 + "13": linux-x86_64 +data: + value: + custom_template: |- + ##INSTRUCTION + {instruction}<|im_end|> + {input}<|im_end|> + {output}<|im_end|> + eval_jsonl: null + eval_split_ratio: 0.1 + format_type: custom + input_field: input + instruction_field: instruction + max_length: 2048 + num_proc: 4 + output_field: output + shuffle: true + system_prompt: | + You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task. + + ## Output Format + + ##OUTPUT + Explain the data flow and why each component must change: + - Flow: [Input → Processing → Output with arrows] + - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]" + - Explain coupling between components + + ##SELECT + modify::crates/path/to/file.rs::impl::ComponentName + add::crates/another/file.rs::function::AnotherComponent + + + ## Rules + + 1. Use full paths: `remove::crates/folder/file.rs::Type::Name` + 2. Use `::` for nested items: `status::StructName::Type::Name` + 3. Always explain "must change because" and "without this" + 3. Types of components: function, struct, enum, impl, trait + 4. If there is extra information (e.g., enum variants), include that too. + 5. Start with ##OUTPUT, end with ##SELECT, terminate with + + ## Example + + ##TASK + Add webhook subscription support + + ##OUTPUT + The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings. + + ##SELECT + crates/common_enums/src/enums.rs::EventClass + crates/common_enums/src/transformers.rs::SubscriptionStatus + + train_jsonl: sft_dataset.jsonl +model: + value: + attn_implementation: null + base_local_dir: base_model + bnb_4bit_compute_dtype: bfloat16 + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + device_map: auto + repo_id: ./Models/Devstral-Small-2-24B-HS-CPT + revision: null + tokenizer_use_fast: true + torch_dtype: bfloat16 + trust_remote_code: true + use_4bit: false +peft: + value: + bias: none + enabled: true + lora_alpha: 16 + lora_dropout: 0.05 + r: 8 + target_modules: auto +run_dir: + value: task2file/sft_devstral_24B_v2 +train: + value: + early_stopping: + enabled: true + metric: eval_loss + min_delta: 0.001 + mode: min + patience: 5 + eval_steps: 100 + evaluation_strategy: steps + gradient_accumulation_steps: 8 + gradient_checkpointing: true + learning_rate: "1e-4" + load_best_model_at_end: true + logging_steps: 2 + lr_scheduler_type: cosine + max_grad_norm: 0.8 + num_train_epochs: 6 + optim: adamw_torch + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + resume_from_checkpoint: auto + save_steps: 500 + save_strategy: steps + save_total_limit: 20 + warmup_ratio: 0.08 + weight_decay: 0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/output.log b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b9063082d9985a472ca23481175a73a73b2ff277 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/output.log @@ -0,0 +1,48 @@ +Wandb initialized: project='sft-training', name='auto-generated' +[info] Detected Mistral3 model architecture, loading with specific class +Loading weights: 100%|█| 585/585 [00:12<00:00, 47.03it/s, Materializing param=model.vision_tower.transfor +[info] Ensuring all parameters are materialized... +Traceback (most recent call last): + File "/workspace/trainer-kit/SFT/run_instruct.py", line 983, in + main() + File "/workspace/trainer-kit/SFT/run_instruct.py", line 852, in main + train_ds, eval_ds = build_datasets(cfg, tokenizer) + File "/workspace/trainer-kit/SFT/run_instruct.py", line 414, in build_datasets + ds = load_dataset("json", data_files={"train": train_path}) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1492, in load_dataset + builder_instance = load_dataset_builder( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1137, in load_dataset_builder + dataset_module = dataset_module_factory( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 913, in dataset_module_factory + ).get_module() + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 527, in get_module + data_files = DataFilesDict.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 708, in from_patterns + else DataFilesList.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 601, in from_patterns + resolve_pattern( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 390, in resolve_pattern + raise FileNotFoundError(error_msg) +FileNotFoundError: Unable to find '/workspace/sft_dataset.jsonl' +Traceback (most recent call last): + File "/workspace/trainer-kit/SFT/run_instruct.py", line 983, in + main() + File "/workspace/trainer-kit/SFT/run_instruct.py", line 852, in main + train_ds, eval_ds = build_datasets(cfg, tokenizer) + File "/workspace/trainer-kit/SFT/run_instruct.py", line 414, in build_datasets + ds = load_dataset("json", data_files={"train": train_path}) + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1492, in load_dataset + builder_instance = load_dataset_builder( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 1137, in load_dataset_builder + dataset_module = dataset_module_factory( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 913, in dataset_module_factory + ).get_module() + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/load.py", line 527, in get_module + data_files = DataFilesDict.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 708, in from_patterns + else DataFilesList.from_patterns( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 601, in from_patterns + resolve_pattern( + File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/data_files.py", line 390, in resolve_pattern + raise FileNotFoundError(error_msg) +FileNotFoundError: Unable to find '/workspace/sft_dataset.jsonl' diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/requirements.txt b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a4241d8724f018c9bdfcd7c289f1f14578574b --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/requirements.txt @@ -0,0 +1,104 @@ +exceptiongroup==1.3.1 +wheel==0.45.1 +python-dateutil==2.9.0.post0 +nvidia-ml-py==13.580.82 +huggingface_hub==1.2.3 +idna==3.11 +click==8.3.1 +numpy==2.2.6 +httpx==0.28.1 +tokenizers==0.22.1 +sympy==1.13.1 +yarl==1.22.0 +async-timeout==5.0.1 +datasets==4.4.2 +platformdirs==4.5.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-nvtx-cu12==12.1.105 +smmap==5.0.2 +accelerate==1.12.0 +requests==2.32.5 +aiohttp==3.13.2 +bitsandbytes==0.49.0 +nvidia-cublas-cu12==12.1.3.1 +mpmath==1.3.0 +torchaudio==2.5.1+cu121 +nvidia-cuda-runtime-cu12==12.1.105 +typing-inspection==0.4.2 +GitPython==3.1.45 +xxhash==3.6.0 +nvidia-cusolver-cu12==11.4.5.107 +pydantic_core==2.41.5 +six==1.17.0 +torchvision==0.20.1+cu121 +typing_extensions==4.15.0 +triton==3.1.0 +charset-normalizer==3.4.4 +nvitop==1.6.1 +wandb==0.23.1 +regex==2025.11.3 +pip==25.3 +nvidia-cusparse-cu12==12.1.0.106 +pytz==2025.2 +Jinja2==3.1.6 +psutil==7.2.0 +pillow==12.0.0 +packaging==25.0 +safetensors==0.7.0 +sentry-sdk==2.48.0 +gitdb==4.0.12 +httpcore==1.0.9 +setuptools==80.9.0 +nvidia-cufft-cu12==11.0.2.54 +anyio==4.12.0 +transformers==5.0.0.dev0 +pydantic==2.12.5 +fsspec==2025.10.0 +filelock==3.20.0 +PyYAML==6.0.3 +hf-xet==1.2.0 +nvidia-cudnn-cu12==9.1.0.70 +tqdm==4.67.1 +MarkupSafe==2.1.5 +attrs==25.4.0 +nvidia-cuda-nvrtc-cu12==12.1.105 +peft==0.18.0 +aiohappyeyeballs==2.6.1 +networkx==3.4.2 +nvidia-nvjitlink-cu12==12.9.86 +certifi==2025.11.12 +pyarrow==22.0.0 +dill==0.4.0 +protobuf==6.33.2 +aiosignal==1.4.0 +frozenlist==1.8.0 +urllib3==2.6.2 +propcache==0.4.1 +tzdata==2025.3 +pandas==2.3.3 +annotated-types==0.7.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.21.5 +multidict==6.7.0 +nvidia-curand-cu12==10.3.2.106 +trl==0.26.2 +torch==2.5.1+cu121 +h11==0.16.0 +multiprocess==0.70.18 +typer-slim==0.21.0 +wheel==0.45.1 +tomli==2.0.1 +autocommand==2.2.2 +jaraco.context==5.3.0 +zipp==3.19.2 +packaging==24.2 +inflect==7.3.1 +typing_extensions==4.12.2 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +jaraco.text==3.12.1 +backports.tarfile==1.2.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +typeguard==4.3.0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/wandb-metadata.json b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3232a37459b9e374a95bfabddec1198541185d6b --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/wandb-metadata.json @@ -0,0 +1,47 @@ +{ + "os": "Linux-6.12.46+-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-12-26T18:07:02.184185Z", + "args": [ + "--config", + "trainer-kit/SFT/config_instruct.yaml" + ], + "program": "/workspace/trainer-kit/SFT/run_instruct.py", + "codePath": "trainer-kit/SFT/run_instruct.py", + "codePathLocal": "trainer-kit/SFT/run_instruct.py", + "email": "shaiksirajuddin9949@gmail.com", + "root": "task2file/sft_devstral_24B_v2", + "host": "a100-2gpu-shell-session-757d587799-mfdvv", + "executable": "/workspace/llm_finetuning_env/bin/python", + "cpu_count": 12, + "cpu_count_logical": 24, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 2, + "disk": { + "/": { + "total": "791251738624", + "used": "386726633472" + } + }, + "memory": { + "total": "359047892992" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40" + } + ], + "cudaVersion": "13.0", + "writerId": "b1hui6k7d05xwq8cyz4bv453ig8gyf1q" +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/wandb-summary.json b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..26eb9fe6450816c7c6ef2f464335d313076bb440 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":19},"_runtime":19} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-core.log b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..b0cba42dd8b22152c62843b7ac870d8b266054b4 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-12-26T18:07:02.269040435Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpn17hlpsl/port-189808.txt","pid":189808,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-26T18:07:02.269706147Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":189808} +{"time":"2025-12-26T18:07:02.26970836Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-189808-189898-3164802245/socket","Net":"unix"}} +{"time":"2025-12-26T18:07:02.451619132Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-26T18:07:02.458183004Z","level":"INFO","msg":"handleInformInit: received","streamId":"oordmylf","id":"1(@)"} +{"time":"2025-12-26T18:07:02.645894292Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"oordmylf","id":"1(@)"} +{"time":"2025-12-26T18:07:21.83310619Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-26T18:07:21.833169189Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-26T18:07:21.833173516Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-26T18:07:21.833268292Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-189808-189898-3164802245/socket","Net":"unix"}} +{"time":"2025-12-26T18:07:21.833292467Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-26T18:07:22.158467464Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-26T18:07:22.158501256Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-26T18:07:22.158518889Z","level":"INFO","msg":"server is closed"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-internal.log b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..64dfd65f6b3611a20c4d004173ca0bab217f9b4b --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2025-12-26T18:07:02.458313987Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-26T18:07:02.645556611Z","level":"INFO","msg":"stream: created new stream","id":"oordmylf"} +{"time":"2025-12-26T18:07:02.645689372Z","level":"INFO","msg":"handler: started","stream_id":"oordmylf"} +{"time":"2025-12-26T18:07:02.645880522Z","level":"INFO","msg":"stream: started","id":"oordmylf"} +{"time":"2025-12-26T18:07:02.64591008Z","level":"INFO","msg":"writer: started","stream_id":"oordmylf"} +{"time":"2025-12-26T18:07:02.64593173Z","level":"INFO","msg":"sender: started","stream_id":"oordmylf"} +{"time":"2025-12-26T18:07:21.833167126Z","level":"INFO","msg":"stream: closing","id":"oordmylf"} +{"time":"2025-12-26T18:07:22.023419324Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-26T18:07:22.157584616Z","level":"INFO","msg":"handler: closed","stream_id":"oordmylf"} +{"time":"2025-12-26T18:07:22.157683147Z","level":"INFO","msg":"sender: closed","stream_id":"oordmylf"} +{"time":"2025-12-26T18:07:22.157690597Z","level":"INFO","msg":"stream: closed","id":"oordmylf"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug.log b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..86f48c5f2da6e74415d4e7a2c4d4e68ae5a4c2d1 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug.log @@ -0,0 +1,23 @@ +2025-12-26 18:07:02,185 INFO MainThread:189808 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-26 18:07:02,185 INFO MainThread:189808 [wandb_setup.py:_flush():80] Configure stats pid to 189808 +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug.log +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/logs/debug-internal.log +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_init.py:init():841] calling init triggers +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': './Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with \n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '1e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 0.8, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_devstral_24B_v2', '_wandb': {}} +2025-12-26 18:07:02,186 INFO MainThread:189808 [wandb_init.py:init():889] starting backend +2025-12-26 18:07:02,451 INFO MainThread:189808 [wandb_init.py:init():892] sending inform_init request +2025-12-26 18:07:02,456 INFO MainThread:189808 [wandb_init.py:init():900] backend started and connected +2025-12-26 18:07:02,459 INFO MainThread:189808 [wandb_init.py:init():970] updated telemetry +2025-12-26 18:07:02,460 INFO MainThread:189808 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-26 18:07:02,828 INFO MainThread:189808 [wandb_init.py:init():1041] starting run threads in backend +2025-12-26 18:07:02,938 INFO MainThread:189808 [wandb_run.py:_console_start():2521] atexit reg +2025-12-26 18:07:02,938 INFO MainThread:189808 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-26 18:07:02,938 INFO MainThread:189808 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-26 18:07:02,938 INFO MainThread:189808 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-26 18:07:02,942 INFO MainThread:189808 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-26 18:07:21,833 INFO wandb-AsyncioManager-main:189808 [service_client.py:_forward_responses():80] Reached EOF. +2025-12-26 18:07:21,833 INFO wandb-AsyncioManager-main:189808 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles. diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/run-oordmylf.wandb b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/run-oordmylf.wandb new file mode 100644 index 0000000000000000000000000000000000000000..cd966f349f83b2a0e55f8e3594f207967287abfb --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180702-oordmylf/run-oordmylf.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65222a2b693b19c75fd9127ee5ba9f80b62d97d491fa6562dbbda73b81643404 +size 208495 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/config.yaml b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f33d0b300dda19ff511d3d9e88d0f7becb10e1aa --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/config.yaml @@ -0,0 +1,630 @@ +_name_or_path: + value: Models/Devstral-Small-2-24B-HS-CPT +_wandb: + value: + cli_version: 0.23.1 + e: + k9a7zk6glrk7p1134kg1guh8b7acqlob: + args: + - --config + - trainer-kit/SFT/config_instruct.yaml + codePath: trainer-kit/SFT/run_instruct.py + codePathLocal: trainer-kit/SFT/run_instruct.py + cpu_count: 12 + cpu_count_logical: 24 + cudaVersion: "13.0" + disk: + /: + total: "791251738624" + used: "387681259520" + email: shaiksirajuddin9949@gmail.com + executable: /workspace/llm_finetuning_env/bin/python + gpu: NVIDIA A100-SXM4-80GB + gpu_count: 2 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100-SXM4-80GB + uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40 + host: a100-2gpu-shell-session-757d587799-mfdvv + memory: + total: "359047892992" + os: Linux-6.12.46+-x86_64-with-glibc2.35 + program: /workspace/trainer-kit/SFT/run_instruct.py + python: CPython 3.10.12 + root: task2file/sft_devstral_24B_v2 + startedAt: "2025-12-26T18:08:08.383305Z" + writerId: k9a7zk6glrk7p1134kg1guh8b7acqlob + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "2": '*' + "5": 1 + "6": + - 1 + "7": [] + python_version: 3.10.12 + t: + "1": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "2": + - 1 + - 11 + - 41 + - 49 + - 51 + - 71 + - 98 + "3": + - 2 + - 7 + - 15 + - 16 + - 19 + - 62 + - 66 + "4": 3.10.12 + "5": 0.23.1 + "6": 5.0.0.dev0 + "9": + "1": transformers_trainer + "12": 0.23.1 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - Mistral3ForConditionalGeneration +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: true +batch_eval_metrics: + value: false +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: null +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data: + value: + custom_template: |- + ##INSTRUCTION + {instruction}<|im_end|> + {input}<|im_end|> + {output}<|im_end|> + eval_jsonl: null + eval_split_ratio: 0.1 + format_type: custom + input_field: input + instruction_field: instruction + max_length: 2048 + num_proc: 4 + output_field: output + shuffle: true + system_prompt: | + You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task. + + ## Output Format + + ##OUTPUT + Explain the data flow and why each component must change: + - Flow: [Input → Processing → Output with arrows] + - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]" + - Explain coupling between components + + ##SELECT + modify::crates/path/to/file.rs::impl::ComponentName + add::crates/another/file.rs::function::AnotherComponent + + + ## Rules + + 1. Use full paths: `remove::crates/folder/file.rs::Type::Name` + 2. Use `::` for nested items: `status::StructName::Type::Name` + 3. Always explain "must change because" and "without this" + 3. Types of components: function, struct, enum, impl, trait + 4. If there is extra information (e.g., enum variants), include that too. + 5. Start with ##OUTPUT, end with ##SELECT, terminate with + + ## Example + + ##TASK + Add webhook subscription support + + ##OUTPUT + The webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don't trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings. + + ##SELECT + crates/common_enums/src/enums.rs::EventClass + crates/common_enums/src/transformers.rs::SubscriptionStatus + + train_jsonl: sft_dataset.jsonl +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: null +disable_tqdm: + value: false +do_eval: + value: true +do_predict: + value: false +do_train: + value: false +dtype: + value: bfloat16 +enable_jit_checkpoint: + value: false +eos_token_id: + value: null +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: 100 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +finetuning_task: + value: null +fp16: + value: false +fp16_full_eval: + value: false +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +full_determinism: + value: false +gradient_accumulation_steps: + value: 8 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: false +group_by_length: + value: false +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_revision: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +image_token_index: + value: 10 +include_for_metrics: + value: [] +include_num_input_tokens_seen: + value: "no" +is_decoder: + value: false +is_encoder_decoder: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 0.0001 +length_column_name: + value: length +liger_kernel_config: + value: null +load_best_model_at_end: + value: true +local_rank: + value: -1 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: null +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 2 +logging_strategy: + value: steps +lr_scheduler_kwargs: + value: null +lr_scheduler_type: + value: cosine +max_grad_norm: + value: 0.8 +max_steps: + value: -1 +metric_for_best_model: + value: eval_loss +model: + value: + attn_implementation: null + base_local_dir: base_model + bnb_4bit_compute_dtype: bfloat16 + bnb_4bit_quant_type: nf4 + bnb_4bit_use_double_quant: false + device_map: auto + repo_id: ./Models/Devstral-Small-2-24B-HS-CPT + revision: null + tokenizer_use_fast: true + torch_dtype: bfloat16 + trust_remote_code: true + use_4bit: false +model/num_parameters: + value: 24022764544 +model_type: + value: mistral3 +multimodal_projector_bias: + value: false +neftune_noise_alpha: + value: null +num_train_epochs: + value: 6 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: task2file/sft_devstral_24B_v2/checkpoints +output_hidden_states: + value: false +pad_token_id: + value: null +parallelism_config: + value: null +peft: + value: + bias: none + enabled: true + lora_alpha: 16 + lora_dropout: 0.05 + r: 8 + target_modules: auto +peft_config: + value: + default: + alora_invocation_tokens: null + arrow_config: null + auto_mapping: null + base_model_name_or_path: Models/Devstral-Small-2-24B-HS-CPT + bias: none + corda_config: null + ensure_weight_tying: false + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 16 + lora_bias: false + lora_dropout: 0.05 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + peft_version: 0.18.0 + qalora_group_size: 16 + r: 8 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - v_proj + - q_proj + - o_proj + - k_proj + target_parameters: null + task_type: CAUSAL_LM + trainable_token_indices: null + use_dora: false + use_qalora: false + use_rslora: false +per_device_eval_batch_size: + value: 1 +per_device_train_batch_size: + value: 1 +prediction_loss_only: + value: false +prefix: + value: null +problem_type: + value: null +project: + value: huggingface +projector_hidden_act: + value: gelu +push_to_hub: + value: false +remove_unused_columns: + value: false +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +run_dir: + value: task2file/sft_devstral_24B_v2 +run_name: + value: null +save_on_each_node: + value: false +save_only_model: + value: false +save_steps: + value: 500 +save_strategy: + value: steps +save_total_limit: + value: 20 +seed: + value: 42 +sep_token_id: + value: null +skip_memory_metrics: + value: true +spatial_merge_size: + value: 2 +task_specific_params: + value: null +text_config: + value: + _name_or_path: "" + add_cross_attention: false + architectures: null + attention_dropout: 0 + bos_token_id: 1 + chunk_size_feed_forward: 0 + cross_attention_hidden_size: null + decoder_start_token_id: null + dtype: bfloat16 + eos_token_id: 2 + finetuning_task: null + head_dim: 128 + hidden_act: silu + hidden_size: 5120 + id2label: + "0": LABEL_0 + "1": LABEL_1 + initializer_range: 0.02 + intermediate_size: 32768 + is_decoder: false + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + max_position_embeddings: 393216 + model_type: ministral3 + num_attention_heads: 32 + num_hidden_layers: 40 + num_key_value_heads: 8 + output_attentions: false + output_hidden_states: false + pad_token_id: 11 + prefix: null + problem_type: null + return_dict: true + rms_norm_eps: 1e-05 + rope_parameters: + beta_fast: 32 + beta_slow: 1 + factor: 48 + llama_4_scaling_beta: 0.1 + mscale: 1 + mscale_all_dim: 1 + original_max_position_embeddings: 8192 + rope_theta: 1e+08 + rope_type: yarn + type: yarn + sep_token_id: null + sliding_window: null + task_specific_params: null + tie_word_embeddings: false + tokenizer_class: null + use_cache: true + vocab_size: 131072 +tf32: + value: null +tie_word_embeddings: + value: false +tokenizer_class: + value: null +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_empty_cache_steps: + value: null +trackio_space_id: + value: trackio +train: + value: + early_stopping: + enabled: true + metric: eval_loss + min_delta: 0.001 + mode: min + patience: 5 + eval_steps: 100 + evaluation_strategy: steps + gradient_accumulation_steps: 8 + gradient_checkpointing: true + learning_rate: "1e-4" + load_best_model_at_end: true + logging_steps: 2 + lr_scheduler_type: cosine + max_grad_norm: 0.8 + num_train_epochs: 6 + optim: adamw_torch + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + resume_from_checkpoint: auto + save_steps: 500 + save_strategy: steps + save_total_limit: 20 + warmup_ratio: 0.08 + weight_decay: 0 +transformers_version: + value: 5.0.0.dev0 +use_cache: + value: false +use_cpu: + value: false +use_liger_kernel: + value: false +vision_config: + value: + _name_or_path: "" + add_cross_attention: false + architectures: null + attention_dropout: 0 + bos_token_id: null + chunk_size_feed_forward: 0 + cross_attention_hidden_size: null + decoder_start_token_id: null + dtype: bfloat16 + eos_token_id: null + finetuning_task: null + head_dim: 64 + hidden_act: silu + hidden_size: 1024 + id2label: + "0": LABEL_0 + "1": LABEL_1 + image_size: 1540 + initializer_range: 0.02 + intermediate_size: 4096 + is_decoder: false + is_encoder_decoder: false + label2id: + LABEL_0: 0 + LABEL_1: 1 + model_type: pixtral + num_attention_heads: 16 + num_channels: 3 + num_hidden_layers: 24 + output_attentions: false + output_hidden_states: false + pad_token_id: null + patch_size: 14 + prefix: null + problem_type: null + return_dict: true + rope_parameters: + rope_theta: 10000 + rope_type: default + sep_token_id: null + task_specific_params: null + tie_word_embeddings: true + tokenizer_class: null +vision_feature_layer: + value: -1 +warmup_ratio: + value: 0.08 +warmup_steps: + value: 0.08 +weight_decay: + value: 0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/output.log b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7a9727d7e8881f804c091717a07620794ac0e939 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/output.log @@ -0,0 +1,3898 @@ +Wandb initialized: project='sft-training', name='auto-generated' +[info] Detected Mistral3 model architecture, loading with specific class +Loading weights: 100%|█| 585/585 [00:12<00:00, 46.98it/s, Materializing param=model.vision_tower.transfor +[info] Ensuring all parameters are materialized... +Generating train split: 21067 examples [00:00, 115482.00 examples/s] +Formatting train instructions (num_proc=4): 100%|█████████| 18960/18960 [00:02<00:00, 7935.71 examples/s] +Formatting eval instructions (num_proc=4): 100%|████████████| 2107/2107 [00:02<00:00, 1010.84 examples/s] +Tokenizing and masking train (num_proc=4): 100%|██████████| 18960/18960 [00:15<00:00, 1259.40 examples/s] +Tokenizing and masking eval (num_proc=4): 100%|██████████████| 2107/2107 [00:03<00:00, 614.09 examples/s] +warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead. +Early stopping enabled: patience=5, min_delta=0.001 +Starting instruction fine-tuning... + 0%| | 0/14220 [00:00 task2file/sft_devstral_24B_v2/best_adapter +Final eval_loss=0.6706293225288391, ppl=1.955467553274469 +--- Merge: task2file/sft_devstral_24B_v2/best_adapter + Models/Devstral-Small-2-24B-HS-CPT -> task2file/sft_devstral_24B_v2/Models/Devstral-Small-2-24B-HS-CPT-SFT_v2 --- +Loading weights: 100%|█| 585/585 [00:09<00:00, 58.83it/s, Materializing param=model.v +Writing model shards: 100%|██████████████████████████| 27/27 [01:16<00:00, 2.84s/it] +--- Merge complete --- diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/requirements.txt b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a4241d8724f018c9bdfcd7c289f1f14578574b --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/requirements.txt @@ -0,0 +1,104 @@ +exceptiongroup==1.3.1 +wheel==0.45.1 +python-dateutil==2.9.0.post0 +nvidia-ml-py==13.580.82 +huggingface_hub==1.2.3 +idna==3.11 +click==8.3.1 +numpy==2.2.6 +httpx==0.28.1 +tokenizers==0.22.1 +sympy==1.13.1 +yarl==1.22.0 +async-timeout==5.0.1 +datasets==4.4.2 +platformdirs==4.5.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-nvtx-cu12==12.1.105 +smmap==5.0.2 +accelerate==1.12.0 +requests==2.32.5 +aiohttp==3.13.2 +bitsandbytes==0.49.0 +nvidia-cublas-cu12==12.1.3.1 +mpmath==1.3.0 +torchaudio==2.5.1+cu121 +nvidia-cuda-runtime-cu12==12.1.105 +typing-inspection==0.4.2 +GitPython==3.1.45 +xxhash==3.6.0 +nvidia-cusolver-cu12==11.4.5.107 +pydantic_core==2.41.5 +six==1.17.0 +torchvision==0.20.1+cu121 +typing_extensions==4.15.0 +triton==3.1.0 +charset-normalizer==3.4.4 +nvitop==1.6.1 +wandb==0.23.1 +regex==2025.11.3 +pip==25.3 +nvidia-cusparse-cu12==12.1.0.106 +pytz==2025.2 +Jinja2==3.1.6 +psutil==7.2.0 +pillow==12.0.0 +packaging==25.0 +safetensors==0.7.0 +sentry-sdk==2.48.0 +gitdb==4.0.12 +httpcore==1.0.9 +setuptools==80.9.0 +nvidia-cufft-cu12==11.0.2.54 +anyio==4.12.0 +transformers==5.0.0.dev0 +pydantic==2.12.5 +fsspec==2025.10.0 +filelock==3.20.0 +PyYAML==6.0.3 +hf-xet==1.2.0 +nvidia-cudnn-cu12==9.1.0.70 +tqdm==4.67.1 +MarkupSafe==2.1.5 +attrs==25.4.0 +nvidia-cuda-nvrtc-cu12==12.1.105 +peft==0.18.0 +aiohappyeyeballs==2.6.1 +networkx==3.4.2 +nvidia-nvjitlink-cu12==12.9.86 +certifi==2025.11.12 +pyarrow==22.0.0 +dill==0.4.0 +protobuf==6.33.2 +aiosignal==1.4.0 +frozenlist==1.8.0 +urllib3==2.6.2 +propcache==0.4.1 +tzdata==2025.3 +pandas==2.3.3 +annotated-types==0.7.0 +shellingham==1.5.4 +nvidia-nccl-cu12==2.21.5 +multidict==6.7.0 +nvidia-curand-cu12==10.3.2.106 +trl==0.26.2 +torch==2.5.1+cu121 +h11==0.16.0 +multiprocess==0.70.18 +typer-slim==0.21.0 +wheel==0.45.1 +tomli==2.0.1 +autocommand==2.2.2 +jaraco.context==5.3.0 +zipp==3.19.2 +packaging==24.2 +inflect==7.3.1 +typing_extensions==4.12.2 +platformdirs==4.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +jaraco.text==3.12.1 +backports.tarfile==1.2.0 +more-itertools==10.3.0 +importlib_metadata==8.0.0 +typeguard==4.3.0 diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/wandb-metadata.json b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..29bb3d6703b061f2ef95ee37817de064e6ca3264 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/wandb-metadata.json @@ -0,0 +1,47 @@ +{ + "os": "Linux-6.12.46+-x86_64-with-glibc2.35", + "python": "CPython 3.10.12", + "startedAt": "2025-12-26T18:08:08.383305Z", + "args": [ + "--config", + "trainer-kit/SFT/config_instruct.yaml" + ], + "program": "/workspace/trainer-kit/SFT/run_instruct.py", + "codePath": "trainer-kit/SFT/run_instruct.py", + "codePathLocal": "trainer-kit/SFT/run_instruct.py", + "email": "shaiksirajuddin9949@gmail.com", + "root": "task2file/sft_devstral_24B_v2", + "host": "a100-2gpu-shell-session-757d587799-mfdvv", + "executable": "/workspace/llm_finetuning_env/bin/python", + "cpu_count": 12, + "cpu_count_logical": 24, + "gpu": "NVIDIA A100-SXM4-80GB", + "gpu_count": 2, + "disk": { + "/": { + "total": "791251738624", + "used": "387681259520" + } + }, + "memory": { + "total": "359047892992" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba" + }, + { + "name": "NVIDIA A100-SXM4-80GB", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40" + } + ], + "cudaVersion": "13.0", + "writerId": "k9a7zk6glrk7p1134kg1guh8b7acqlob" +} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/wandb-summary.json b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a5cfc9cd9be5fb12be9b1727852dbfae0b776667 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/files/wandb-summary.json @@ -0,0 +1 @@ +{"eval/runtime":511.6513,"_timestamp":1.766894424089547e+09,"train/learning_rate":5.0960525730763455e-05,"train_runtime":121379.3179,"_wandb":{"runtime":122035},"eval/samples_per_second":4.118,"train_loss":0.6813117427967097,"total_flos":7.892056292508187e+18,"train/loss":0.5559571981430054,"eval/steps_per_second":4.118,"_step":3877,"train/global_step":7600,"train_samples_per_second":0.937,"train/grad_norm":1.2778210639953613,"train_steps_per_second":0.117,"eval/loss":0.6706293225288391,"_runtime":122035,"train/epoch":3.2067510548523206} \ No newline at end of file diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-core.log b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..6ba382701a433c6f686983b09823d5cd4a40cd4e --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-core.log @@ -0,0 +1,16 @@ +{"time":"2025-12-26T18:08:08.471552693Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_2pxhrkk/port-190322.txt","pid":190322,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-12-26T18:08:08.472277502Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":190322} +{"time":"2025-12-26T18:08:08.472253441Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-190322-190397-1549185738/socket","Net":"unix"}} +{"time":"2025-12-26T18:08:08.653761101Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2025-12-26T18:08:08.660887612Z","level":"INFO","msg":"handleInformInit: received","streamId":"ny9q48hd","id":"1(@)"} +{"time":"2025-12-26T18:08:08.822064564Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ny9q48hd","id":"1(@)"} +{"time":"2025-12-28T04:02:05.051125329Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"ny9q48hd","id":"1(@)"} +{"time":"2025-12-28T04:02:05.052538266Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"ny9q48hd","id":"1(@)"} +{"time":"2025-12-28T04:02:05.107259931Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"} +{"time":"2025-12-28T04:02:05.107301964Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"} +{"time":"2025-12-28T04:02:05.107312563Z","level":"INFO","msg":"connection: closing","id":"1(@)"} +{"time":"2025-12-28T04:02:05.107351058Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"} +{"time":"2025-12-28T04:02:05.107355378Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"} +{"time":"2025-12-28T04:02:05.107365658Z","level":"INFO","msg":"server is shutting down"} +{"time":"2025-12-28T04:02:05.107515239Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-190322-190397-1549185738/socket","Net":"unix"}} +{"time":"2025-12-28T04:02:05.107566032Z","level":"INFO","msg":"server is closed"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-internal.log b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0f91037f83a83206ed5992bb506a61dc164da769 --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2025-12-26T18:08:08.66103332Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"} +{"time":"2025-12-26T18:08:08.82172381Z","level":"INFO","msg":"stream: created new stream","id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.821819478Z","level":"INFO","msg":"handler: started","stream_id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.822049155Z","level":"INFO","msg":"stream: started","id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.822072296Z","level":"INFO","msg":"writer: started","stream_id":"ny9q48hd"} +{"time":"2025-12-26T18:08:08.822098276Z","level":"INFO","msg":"sender: started","stream_id":"ny9q48hd"} +{"time":"2025-12-28T04:02:04.935383596Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-12-28T04:02:05.045953421Z","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-12-28T04:02:05.051806259Z","level":"INFO","msg":"stream: closing","id":"ny9q48hd"} +{"time":"2025-12-28T04:02:05.051833004Z","level":"INFO","msg":"handler: closed","stream_id":"ny9q48hd"} +{"time":"2025-12-28T04:02:05.051917075Z","level":"INFO","msg":"sender: closed","stream_id":"ny9q48hd"} +{"time":"2025-12-28T04:02:05.051937152Z","level":"INFO","msg":"stream: closed","id":"ny9q48hd"} diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug.log b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0d4749de3aa5f0726c27004e2dfdcc433243200c --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug.log @@ -0,0 +1,29 @@ +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1 +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Configure stats pid to 190322 +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from /workspace/wandb/settings +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_setup.py:_flush():80] Loading settings from environment variables +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:setup_run_log_directory():714] Logging user logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug.log +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to task2file/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/logs/debug-internal.log +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():841] calling init triggers +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():846] wandb.init called with sweep_config: {} +config: {'model': {'repo_id': './Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'sft_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'instruction_field': 'instruction', 'input_field': 'input', 'output_field': 'output', 'format_type': 'custom', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with \n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'train': {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '1e-4', 'weight_decay': 0.0, 'warmup_ratio': 0.08, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 0.8, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'evaluation_strategy': 'steps', 'eval_steps': 100, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'task2file/sft_devstral_24B_v2', '_wandb': {}} +2025-12-26 18:08:08,385 INFO MainThread:190322 [wandb_init.py:init():889] starting backend +2025-12-26 18:08:08,653 INFO MainThread:190322 [wandb_init.py:init():892] sending inform_init request +2025-12-26 18:08:08,658 INFO MainThread:190322 [wandb_init.py:init():900] backend started and connected +2025-12-26 18:08:08,661 INFO MainThread:190322 [wandb_init.py:init():970] updated telemetry +2025-12-26 18:08:08,662 INFO MainThread:190322 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout +2025-12-26 18:08:09,021 INFO MainThread:190322 [wandb_init.py:init():1041] starting run threads in backend +2025-12-26 18:08:09,134 INFO MainThread:190322 [wandb_run.py:_console_start():2521] atexit reg +2025-12-26 18:08:09,134 INFO MainThread:190322 [wandb_run.py:_redirect():2369] redirect: wrap_raw +2025-12-26 18:08:09,135 INFO MainThread:190322 [wandb_run.py:_redirect():2438] Wrapping output streams. +2025-12-26 18:08:09,135 INFO MainThread:190322 [wandb_run.py:_redirect():2461] Redirects installed. +2025-12-26 18:08:09,138 INFO MainThread:190322 [wandb_init.py:init():1081] run started, returning control to user process +2025-12-26 18:08:52,955 INFO MainThread:190322 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': 'Models/Devstral-Small-2-24B-HS-CPT', 'revision': None, 'inference_mode': False, 'r': 8, 'target_modules': ['v_proj', 'q_proj', 'o_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 16, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'image_token_index': 10, 'projector_hidden_act': 'gelu', 'vision_feature_layer': -1, 'vision_config': {'hidden_size': 1024, 'intermediate_size': 4096, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'num_channels': 3, 'patch_size': 14, 'image_size': 1540, 'attention_dropout': 0.0, 'hidden_act': 'silu', 'head_dim': 64, 'initializer_range': 0.02, 'rope_parameters': {'rope_theta': 10000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '', 'model_type': 'pixtral', 'output_attentions': False}, 'text_config': {'vocab_size': 131072, 'max_position_embeddings': 393216, 'hidden_size': 5120, 'intermediate_size': 32768, 'num_hidden_layers': 40, 'num_attention_heads': 32, 'sliding_window': None, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': True, 'attention_dropout': 0.0, 'rope_parameters': {'beta_fast': 32.0, 'beta_slow': 1.0, 'factor': 48.0, 'llama_4_scaling_beta': 0.1, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'original_max_position_embeddings': 8192, 'rope_theta': 100000000.0, 'rope_type': 'yarn', 'type': 'yarn'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': 11, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '', 'model_type': 'ministral3', 'output_attentions': False}, 'multimodal_projector_bias': False, 'spatial_merge_size': 2, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Mistral3ForConditionalGeneration'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': 'Models/Devstral-Small-2-24B-HS-CPT', 'transformers_version': '5.0.0.dev0', 'model_type': 'mistral3', 'use_cache': False, 'output_attentions': False, 'output_dir': 'task2file/sft_devstral_24B_v2/checkpoints', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 0.8, 'num_train_epochs': 6.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.08, 'warmup_steps': 0.08, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': 20, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True} +2025-12-26 18:08:52,965 INFO MainThread:190322 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 24022764544 - > +2025-12-26 18:08:52,965 INFO MainThread:190322 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 24022764544 None +2025-12-28 04:02:04,643 INFO MainThread:190322 [wandb_run.py:_finish():2287] finishing run sirajuddin-shaik-007/sft-training/ny9q48hd +2025-12-28 04:02:04,645 INFO MainThread:190322 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0 +2025-12-28 04:02:04,646 INFO MainThread:190322 [wandb_run.py:_restore():2468] restore +2025-12-28 04:02:04,646 INFO MainThread:190322 [wandb_run.py:_restore():2474] restore done +2025-12-28 04:02:05,050 INFO MainThread:190322 [wandb_run.py:_footer_sync_info():3862] logging synced files diff --git a/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/run-ny9q48hd.wandb b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/run-ny9q48hd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..06c7d7a6586161a42af1738174f38f536a1dfc7a --- /dev/null +++ b/sft_devstral_24B_v2/wandb/run-20251226_180808-ny9q48hd/run-ny9q48hd.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83732f43ba8db2e52d5f37f7e6e4a426ed0ffe211e98f07339fe9de983e6698c +size 63827640