| { | |
| "best_metric": 0.8083848357200623, | |
| "best_model_checkpoint": "./kaggle/working/eGTZANplus/checkpoint-220", | |
| "epoch": 20.0, | |
| "eval_steps": 10, | |
| "global_step": 1080, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.4207828044891357, | |
| "learning_rate": 0.00019814814814814814, | |
| "loss": 2.4003, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "eval_accuracy": 0.19576719576719576, | |
| "eval_loss": 2.282846689224243, | |
| "eval_runtime": 1.8415, | |
| "eval_samples_per_second": 102.636, | |
| "eval_steps_per_second": 6.517, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.2310184240341187, | |
| "learning_rate": 0.0001962962962962963, | |
| "loss": 2.1703, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "eval_accuracy": 0.35978835978835977, | |
| "eval_loss": 1.9852432012557983, | |
| "eval_runtime": 1.793, | |
| "eval_samples_per_second": 105.41, | |
| "eval_steps_per_second": 6.693, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.0627870559692383, | |
| "learning_rate": 0.00019444444444444446, | |
| "loss": 1.9696, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_accuracy": 0.3915343915343915, | |
| "eval_loss": 1.8232808113098145, | |
| "eval_runtime": 1.786, | |
| "eval_samples_per_second": 105.821, | |
| "eval_steps_per_second": 6.719, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.5207180976867676, | |
| "learning_rate": 0.0001925925925925926, | |
| "loss": 1.8051, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "eval_accuracy": 0.48677248677248675, | |
| "eval_loss": 1.6591798067092896, | |
| "eval_runtime": 1.7501, | |
| "eval_samples_per_second": 107.997, | |
| "eval_steps_per_second": 6.857, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 2.221734046936035, | |
| "learning_rate": 0.00019074074074074075, | |
| "loss": 1.6692, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "eval_accuracy": 0.582010582010582, | |
| "eval_loss": 1.5287415981292725, | |
| "eval_runtime": 1.7993, | |
| "eval_samples_per_second": 105.039, | |
| "eval_steps_per_second": 6.669, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.5369292497634888, | |
| "learning_rate": 0.0001890740740740741, | |
| "loss": 1.5283, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "eval_accuracy": 0.5608465608465608, | |
| "eval_loss": 1.4252889156341553, | |
| "eval_runtime": 1.7582, | |
| "eval_samples_per_second": 107.493, | |
| "eval_steps_per_second": 6.825, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.9959373474121094, | |
| "learning_rate": 0.00018722222222222222, | |
| "loss": 1.3981, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "eval_accuracy": 0.5925925925925926, | |
| "eval_loss": 1.3883891105651855, | |
| "eval_runtime": 1.7749, | |
| "eval_samples_per_second": 106.485, | |
| "eval_steps_per_second": 6.761, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.101576805114746, | |
| "learning_rate": 0.00018537037037037038, | |
| "loss": 1.3047, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "eval_accuracy": 0.5767195767195767, | |
| "eval_loss": 1.356843113899231, | |
| "eval_runtime": 1.7875, | |
| "eval_samples_per_second": 105.735, | |
| "eval_steps_per_second": 6.713, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.9240992069244385, | |
| "learning_rate": 0.00018351851851851854, | |
| "loss": 1.1325, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "eval_accuracy": 0.6349206349206349, | |
| "eval_loss": 1.2104465961456299, | |
| "eval_runtime": 1.7741, | |
| "eval_samples_per_second": 106.533, | |
| "eval_steps_per_second": 6.764, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.6294556856155396, | |
| "learning_rate": 0.00018166666666666667, | |
| "loss": 1.2004, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "eval_accuracy": 0.6137566137566137, | |
| "eval_loss": 1.263272762298584, | |
| "eval_runtime": 1.8419, | |
| "eval_samples_per_second": 102.609, | |
| "eval_steps_per_second": 6.515, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 4.842734336853027, | |
| "learning_rate": 0.0001798148148148148, | |
| "loss": 1.0475, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "eval_accuracy": 0.5555555555555556, | |
| "eval_loss": 1.3616496324539185, | |
| "eval_runtime": 1.7824, | |
| "eval_samples_per_second": 106.036, | |
| "eval_steps_per_second": 6.732, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.8519538640975952, | |
| "learning_rate": 0.00017796296296296296, | |
| "loss": 0.9801, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "eval_accuracy": 0.671957671957672, | |
| "eval_loss": 1.1471754312515259, | |
| "eval_runtime": 1.796, | |
| "eval_samples_per_second": 105.234, | |
| "eval_steps_per_second": 6.682, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 3.018026351928711, | |
| "learning_rate": 0.00017611111111111112, | |
| "loss": 0.862, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "eval_accuracy": 0.6984126984126984, | |
| "eval_loss": 1.0452642440795898, | |
| "eval_runtime": 1.7578, | |
| "eval_samples_per_second": 107.521, | |
| "eval_steps_per_second": 6.827, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.8672127723693848, | |
| "learning_rate": 0.00017425925925925928, | |
| "loss": 0.8905, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "eval_accuracy": 0.6825396825396826, | |
| "eval_loss": 0.9718140363693237, | |
| "eval_runtime": 1.8323, | |
| "eval_samples_per_second": 103.148, | |
| "eval_steps_per_second": 6.549, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 3.5106003284454346, | |
| "learning_rate": 0.00017240740740740742, | |
| "loss": 0.7839, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "eval_accuracy": 0.6666666666666666, | |
| "eval_loss": 1.0531541109085083, | |
| "eval_runtime": 1.7655, | |
| "eval_samples_per_second": 107.049, | |
| "eval_steps_per_second": 6.797, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 2.7532589435577393, | |
| "learning_rate": 0.00017055555555555555, | |
| "loss": 0.8304, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "eval_accuracy": 0.6878306878306878, | |
| "eval_loss": 0.96842360496521, | |
| "eval_runtime": 1.8371, | |
| "eval_samples_per_second": 102.881, | |
| "eval_steps_per_second": 6.532, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 2.1222331523895264, | |
| "learning_rate": 0.0001687037037037037, | |
| "loss": 0.883, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "eval_accuracy": 0.6931216931216931, | |
| "eval_loss": 0.9298208951950073, | |
| "eval_runtime": 1.7867, | |
| "eval_samples_per_second": 105.782, | |
| "eval_steps_per_second": 6.716, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 2.5858914852142334, | |
| "learning_rate": 0.00016685185185185187, | |
| "loss": 0.5714, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "eval_accuracy": 0.6772486772486772, | |
| "eval_loss": 0.9491019248962402, | |
| "eval_runtime": 1.7856, | |
| "eval_samples_per_second": 105.846, | |
| "eval_steps_per_second": 6.72, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 1.7296024560928345, | |
| "learning_rate": 0.000165, | |
| "loss": 0.5209, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "eval_accuracy": 0.6984126984126984, | |
| "eval_loss": 0.914806604385376, | |
| "eval_runtime": 1.7453, | |
| "eval_samples_per_second": 108.289, | |
| "eval_steps_per_second": 6.875, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 4.235101699829102, | |
| "learning_rate": 0.00016314814814814816, | |
| "loss": 0.5404, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "eval_accuracy": 0.671957671957672, | |
| "eval_loss": 1.0290465354919434, | |
| "eval_runtime": 1.8123, | |
| "eval_samples_per_second": 104.285, | |
| "eval_steps_per_second": 6.621, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 3.8817615509033203, | |
| "learning_rate": 0.0001612962962962963, | |
| "loss": 0.6133, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.9116460680961609, | |
| "eval_runtime": 1.7735, | |
| "eval_samples_per_second": 106.57, | |
| "eval_steps_per_second": 6.766, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 1.743445634841919, | |
| "learning_rate": 0.00015944444444444445, | |
| "loss": 0.4347, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 0.8083848357200623, | |
| "eval_runtime": 1.8193, | |
| "eval_samples_per_second": 103.884, | |
| "eval_steps_per_second": 6.596, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 1.8867310285568237, | |
| "learning_rate": 0.0001575925925925926, | |
| "loss": 0.3659, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.890904426574707, | |
| "eval_runtime": 1.7392, | |
| "eval_samples_per_second": 108.672, | |
| "eval_steps_per_second": 6.9, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 2.56878399848938, | |
| "learning_rate": 0.00015574074074074074, | |
| "loss": 0.4439, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "eval_accuracy": 0.6825396825396826, | |
| "eval_loss": 0.9554860591888428, | |
| "eval_runtime": 1.7559, | |
| "eval_samples_per_second": 107.64, | |
| "eval_steps_per_second": 6.834, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 1.9487425088882446, | |
| "learning_rate": 0.0001538888888888889, | |
| "loss": 0.3335, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "eval_accuracy": 0.708994708994709, | |
| "eval_loss": 0.931969404220581, | |
| "eval_runtime": 1.8636, | |
| "eval_samples_per_second": 101.417, | |
| "eval_steps_per_second": 6.439, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 2.4911906719207764, | |
| "learning_rate": 0.00015203703703703703, | |
| "loss": 0.3695, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "eval_accuracy": 0.7037037037037037, | |
| "eval_loss": 0.9643996357917786, | |
| "eval_runtime": 1.743, | |
| "eval_samples_per_second": 108.437, | |
| "eval_steps_per_second": 6.885, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.4799601137638092, | |
| "learning_rate": 0.0001501851851851852, | |
| "loss": 0.3018, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.6455026455026455, | |
| "eval_loss": 1.1127641201019287, | |
| "eval_runtime": 1.8057, | |
| "eval_samples_per_second": 104.667, | |
| "eval_steps_per_second": 6.646, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "grad_norm": 0.8545930981636047, | |
| "learning_rate": 0.00014833333333333335, | |
| "loss": 0.2418, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "eval_accuracy": 0.7301587301587301, | |
| "eval_loss": 0.8752605319023132, | |
| "eval_runtime": 1.7714, | |
| "eval_samples_per_second": 106.698, | |
| "eval_steps_per_second": 6.774, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 5.37, | |
| "grad_norm": 2.0490822792053223, | |
| "learning_rate": 0.00014648148148148148, | |
| "loss": 0.2305, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 5.37, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.9517038464546204, | |
| "eval_runtime": 1.7422, | |
| "eval_samples_per_second": 108.483, | |
| "eval_steps_per_second": 6.888, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 5.56, | |
| "grad_norm": 1.5348315238952637, | |
| "learning_rate": 0.00014462962962962962, | |
| "loss": 0.238, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 5.56, | |
| "eval_accuracy": 0.7248677248677249, | |
| "eval_loss": 0.9478802680969238, | |
| "eval_runtime": 1.7999, | |
| "eval_samples_per_second": 105.006, | |
| "eval_steps_per_second": 6.667, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 5.74, | |
| "grad_norm": 2.6169273853302, | |
| "learning_rate": 0.00014277777777777778, | |
| "loss": 0.2099, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 5.74, | |
| "eval_accuracy": 0.671957671957672, | |
| "eval_loss": 1.103389024734497, | |
| "eval_runtime": 1.8453, | |
| "eval_samples_per_second": 102.42, | |
| "eval_steps_per_second": 6.503, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 5.93, | |
| "grad_norm": 2.5781023502349854, | |
| "learning_rate": 0.00014092592592592594, | |
| "loss": 0.2284, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 5.93, | |
| "eval_accuracy": 0.6825396825396826, | |
| "eval_loss": 1.031624674797058, | |
| "eval_runtime": 1.7579, | |
| "eval_samples_per_second": 107.517, | |
| "eval_steps_per_second": 6.826, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 6.11, | |
| "grad_norm": 3.042239189147949, | |
| "learning_rate": 0.0001390740740740741, | |
| "loss": 0.1694, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 6.11, | |
| "eval_accuracy": 0.6613756613756614, | |
| "eval_loss": 1.1174468994140625, | |
| "eval_runtime": 1.7854, | |
| "eval_samples_per_second": 105.856, | |
| "eval_steps_per_second": 6.721, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "grad_norm": 0.8211657404899597, | |
| "learning_rate": 0.00013722222222222223, | |
| "loss": 0.1715, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "eval_accuracy": 0.6772486772486772, | |
| "eval_loss": 1.1067023277282715, | |
| "eval_runtime": 1.8157, | |
| "eval_samples_per_second": 104.091, | |
| "eval_steps_per_second": 6.609, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 1.9425742626190186, | |
| "learning_rate": 0.00013537037037037036, | |
| "loss": 0.123, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 1.0037899017333984, | |
| "eval_runtime": 1.786, | |
| "eval_samples_per_second": 105.821, | |
| "eval_steps_per_second": 6.719, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "grad_norm": 2.7061989307403564, | |
| "learning_rate": 0.00013351851851851852, | |
| "loss": 0.1297, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "eval_accuracy": 0.6772486772486772, | |
| "eval_loss": 1.1142699718475342, | |
| "eval_runtime": 1.7368, | |
| "eval_samples_per_second": 108.818, | |
| "eval_steps_per_second": 6.909, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 6.85, | |
| "grad_norm": 2.478459358215332, | |
| "learning_rate": 0.00013166666666666668, | |
| "loss": 0.2191, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 6.85, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 0.9896882176399231, | |
| "eval_runtime": 1.7802, | |
| "eval_samples_per_second": 106.167, | |
| "eval_steps_per_second": 6.741, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 1.6921576261520386, | |
| "learning_rate": 0.0001298148148148148, | |
| "loss": 0.1206, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "eval_accuracy": 0.7407407407407407, | |
| "eval_loss": 0.962655782699585, | |
| "eval_runtime": 1.7667, | |
| "eval_samples_per_second": 106.982, | |
| "eval_steps_per_second": 6.793, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 7.22, | |
| "grad_norm": 0.8060858845710754, | |
| "learning_rate": 0.00012796296296296297, | |
| "loss": 0.1071, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 7.22, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0495431423187256, | |
| "eval_runtime": 1.7473, | |
| "eval_samples_per_second": 108.168, | |
| "eval_steps_per_second": 6.868, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 7.41, | |
| "grad_norm": 0.38671812415122986, | |
| "learning_rate": 0.0001261111111111111, | |
| "loss": 0.1102, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 7.41, | |
| "eval_accuracy": 0.7301587301587301, | |
| "eval_loss": 1.0441887378692627, | |
| "eval_runtime": 1.7747, | |
| "eval_samples_per_second": 106.496, | |
| "eval_steps_per_second": 6.762, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 7.59, | |
| "grad_norm": 1.2801034450531006, | |
| "learning_rate": 0.0001242592592592593, | |
| "loss": 0.1269, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 7.59, | |
| "eval_accuracy": 0.7407407407407407, | |
| "eval_loss": 1.0281165838241577, | |
| "eval_runtime": 1.811, | |
| "eval_samples_per_second": 104.363, | |
| "eval_steps_per_second": 6.626, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 7.78, | |
| "grad_norm": 0.92644864320755, | |
| "learning_rate": 0.00012240740740740742, | |
| "loss": 0.0694, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 7.78, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 1.0361741781234741, | |
| "eval_runtime": 1.7423, | |
| "eval_samples_per_second": 108.479, | |
| "eval_steps_per_second": 6.888, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "grad_norm": 0.8203582167625427, | |
| "learning_rate": 0.00012055555555555555, | |
| "loss": 0.0548, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.071204423904419, | |
| "eval_runtime": 1.7384, | |
| "eval_samples_per_second": 108.723, | |
| "eval_steps_per_second": 6.903, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 8.15, | |
| "grad_norm": 4.490820407867432, | |
| "learning_rate": 0.00011870370370370371, | |
| "loss": 0.062, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 8.15, | |
| "eval_accuracy": 0.7301587301587301, | |
| "eval_loss": 1.035632610321045, | |
| "eval_runtime": 1.8141, | |
| "eval_samples_per_second": 104.182, | |
| "eval_steps_per_second": 6.615, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 8.33, | |
| "grad_norm": 1.979749321937561, | |
| "learning_rate": 0.00011685185185185186, | |
| "loss": 0.0542, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 8.33, | |
| "eval_accuracy": 0.6984126984126984, | |
| "eval_loss": 1.2573037147521973, | |
| "eval_runtime": 1.7824, | |
| "eval_samples_per_second": 106.034, | |
| "eval_steps_per_second": 6.732, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 8.52, | |
| "grad_norm": 4.157647609710693, | |
| "learning_rate": 0.00011499999999999999, | |
| "loss": 0.0823, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 8.52, | |
| "eval_accuracy": 0.7195767195767195, | |
| "eval_loss": 1.1037700176239014, | |
| "eval_runtime": 1.7489, | |
| "eval_samples_per_second": 108.066, | |
| "eval_steps_per_second": 6.861, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 8.7, | |
| "grad_norm": 0.08767159283161163, | |
| "learning_rate": 0.00011314814814814816, | |
| "loss": 0.1354, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 8.7, | |
| "eval_accuracy": 0.7407407407407407, | |
| "eval_loss": 1.0803223848342896, | |
| "eval_runtime": 1.7889, | |
| "eval_samples_per_second": 105.654, | |
| "eval_steps_per_second": 6.708, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 8.89, | |
| "grad_norm": 0.6974061131477356, | |
| "learning_rate": 0.0001112962962962963, | |
| "loss": 0.0798, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 8.89, | |
| "eval_accuracy": 0.671957671957672, | |
| "eval_loss": 1.2207469940185547, | |
| "eval_runtime": 1.7456, | |
| "eval_samples_per_second": 108.27, | |
| "eval_steps_per_second": 6.874, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 9.07, | |
| "grad_norm": 2.0027213096618652, | |
| "learning_rate": 0.00010944444444444445, | |
| "loss": 0.0963, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 9.07, | |
| "eval_accuracy": 0.656084656084656, | |
| "eval_loss": 1.337466835975647, | |
| "eval_runtime": 1.7654, | |
| "eval_samples_per_second": 107.06, | |
| "eval_steps_per_second": 6.797, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 9.26, | |
| "grad_norm": 0.14471650123596191, | |
| "learning_rate": 0.0001075925925925926, | |
| "loss": 0.0557, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 9.26, | |
| "eval_accuracy": 0.6984126984126984, | |
| "eval_loss": 1.2044044733047485, | |
| "eval_runtime": 1.9948, | |
| "eval_samples_per_second": 94.745, | |
| "eval_steps_per_second": 6.016, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "grad_norm": 0.07393530756235123, | |
| "learning_rate": 0.00010574074074074075, | |
| "loss": 0.0491, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "eval_accuracy": 0.7248677248677249, | |
| "eval_loss": 1.18802809715271, | |
| "eval_runtime": 1.8204, | |
| "eval_samples_per_second": 103.822, | |
| "eval_steps_per_second": 6.592, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 9.63, | |
| "grad_norm": 0.12744389474391937, | |
| "learning_rate": 0.0001038888888888889, | |
| "loss": 0.0502, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 9.63, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.098527193069458, | |
| "eval_runtime": 1.7601, | |
| "eval_samples_per_second": 107.378, | |
| "eval_steps_per_second": 6.818, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 9.81, | |
| "grad_norm": 0.07471567392349243, | |
| "learning_rate": 0.00010203703703703704, | |
| "loss": 0.0396, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 9.81, | |
| "eval_accuracy": 0.708994708994709, | |
| "eval_loss": 1.214396595954895, | |
| "eval_runtime": 1.7884, | |
| "eval_samples_per_second": 105.68, | |
| "eval_steps_per_second": 6.71, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.16710619628429413, | |
| "learning_rate": 0.00010018518518518518, | |
| "loss": 0.0717, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.7037037037037037, | |
| "eval_loss": 1.2163357734680176, | |
| "eval_runtime": 1.7401, | |
| "eval_samples_per_second": 108.615, | |
| "eval_steps_per_second": 6.896, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 10.19, | |
| "grad_norm": 0.07553374022245407, | |
| "learning_rate": 9.833333333333333e-05, | |
| "loss": 0.0279, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 10.19, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 1.119241714477539, | |
| "eval_runtime": 1.766, | |
| "eval_samples_per_second": 107.023, | |
| "eval_steps_per_second": 6.795, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 10.37, | |
| "grad_norm": 0.07353632897138596, | |
| "learning_rate": 9.648148148148149e-05, | |
| "loss": 0.0329, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 10.37, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 1.1961112022399902, | |
| "eval_runtime": 1.8216, | |
| "eval_samples_per_second": 103.758, | |
| "eval_steps_per_second": 6.588, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 10.56, | |
| "grad_norm": 0.5441647171974182, | |
| "learning_rate": 9.462962962962963e-05, | |
| "loss": 0.028, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 10.56, | |
| "eval_accuracy": 0.6984126984126984, | |
| "eval_loss": 1.1282387971878052, | |
| "eval_runtime": 1.7883, | |
| "eval_samples_per_second": 105.689, | |
| "eval_steps_per_second": 6.71, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 10.74, | |
| "grad_norm": 0.07243653386831284, | |
| "learning_rate": 9.277777777777778e-05, | |
| "loss": 0.0373, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 10.74, | |
| "eval_accuracy": 0.7195767195767195, | |
| "eval_loss": 1.0716224908828735, | |
| "eval_runtime": 1.736, | |
| "eval_samples_per_second": 108.873, | |
| "eval_steps_per_second": 6.913, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 10.93, | |
| "grad_norm": 0.04851379618048668, | |
| "learning_rate": 9.092592592592593e-05, | |
| "loss": 0.0368, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 10.93, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 1.1750774383544922, | |
| "eval_runtime": 1.7848, | |
| "eval_samples_per_second": 105.895, | |
| "eval_steps_per_second": 6.723, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 11.11, | |
| "grad_norm": 0.05160636082291603, | |
| "learning_rate": 8.907407407407407e-05, | |
| "loss": 0.0485, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 11.11, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 1.0984432697296143, | |
| "eval_runtime": 1.7772, | |
| "eval_samples_per_second": 106.345, | |
| "eval_steps_per_second": 6.752, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 11.3, | |
| "grad_norm": 0.054380565881729126, | |
| "learning_rate": 8.722222222222223e-05, | |
| "loss": 0.0234, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 11.3, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0418734550476074, | |
| "eval_runtime": 1.7977, | |
| "eval_samples_per_second": 105.132, | |
| "eval_steps_per_second": 6.675, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 11.48, | |
| "grad_norm": 0.32195061445236206, | |
| "learning_rate": 8.537037037037038e-05, | |
| "loss": 0.028, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 11.48, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0536975860595703, | |
| "eval_runtime": 1.7586, | |
| "eval_samples_per_second": 107.47, | |
| "eval_steps_per_second": 6.823, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 11.67, | |
| "grad_norm": 1.8460614681243896, | |
| "learning_rate": 8.351851851851852e-05, | |
| "loss": 0.0237, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 11.67, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.0571786165237427, | |
| "eval_runtime": 1.7901, | |
| "eval_samples_per_second": 105.578, | |
| "eval_steps_per_second": 6.703, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 11.85, | |
| "grad_norm": 1.7614848613739014, | |
| "learning_rate": 8.166666666666667e-05, | |
| "loss": 0.0198, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 11.85, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.0192136764526367, | |
| "eval_runtime": 1.7683, | |
| "eval_samples_per_second": 106.885, | |
| "eval_steps_per_second": 6.786, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 12.04, | |
| "grad_norm": 0.22871683537960052, | |
| "learning_rate": 7.981481481481481e-05, | |
| "loss": 0.02, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 12.04, | |
| "eval_accuracy": 0.7195767195767195, | |
| "eval_loss": 1.244175672531128, | |
| "eval_runtime": 1.8603, | |
| "eval_samples_per_second": 101.595, | |
| "eval_steps_per_second": 6.45, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 12.22, | |
| "grad_norm": 0.03752712532877922, | |
| "learning_rate": 7.796296296296297e-05, | |
| "loss": 0.0216, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 12.22, | |
| "eval_accuracy": 0.7407407407407407, | |
| "eval_loss": 1.1395213603973389, | |
| "eval_runtime": 1.7992, | |
| "eval_samples_per_second": 105.048, | |
| "eval_steps_per_second": 6.67, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 12.41, | |
| "grad_norm": 0.09251418709754944, | |
| "learning_rate": 7.61111111111111e-05, | |
| "loss": 0.0309, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 12.41, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 1.1767151355743408, | |
| "eval_runtime": 1.8204, | |
| "eval_samples_per_second": 103.823, | |
| "eval_steps_per_second": 6.592, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 12.59, | |
| "grad_norm": 0.03858701139688492, | |
| "learning_rate": 7.425925925925927e-05, | |
| "loss": 0.0315, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 12.59, | |
| "eval_accuracy": 0.7248677248677249, | |
| "eval_loss": 1.1881897449493408, | |
| "eval_runtime": 1.7853, | |
| "eval_samples_per_second": 105.862, | |
| "eval_steps_per_second": 6.721, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 12.78, | |
| "grad_norm": 0.2986956536769867, | |
| "learning_rate": 7.240740740740741e-05, | |
| "loss": 0.017, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 12.78, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 1.1652072668075562, | |
| "eval_runtime": 1.8006, | |
| "eval_samples_per_second": 104.965, | |
| "eval_steps_per_second": 6.664, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 12.96, | |
| "grad_norm": 0.23789283633232117, | |
| "learning_rate": 7.055555555555556e-05, | |
| "loss": 0.02, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 12.96, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.1011323928833008, | |
| "eval_runtime": 1.7393, | |
| "eval_samples_per_second": 108.665, | |
| "eval_steps_per_second": 6.899, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 13.15, | |
| "grad_norm": 0.0361974723637104, | |
| "learning_rate": 6.87037037037037e-05, | |
| "loss": 0.0174, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 13.15, | |
| "eval_accuracy": 0.7354497354497355, | |
| "eval_loss": 1.092558741569519, | |
| "eval_runtime": 1.8005, | |
| "eval_samples_per_second": 104.97, | |
| "eval_steps_per_second": 6.665, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 13.33, | |
| "grad_norm": 0.04739515110850334, | |
| "learning_rate": 6.685185185185185e-05, | |
| "loss": 0.012, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 13.33, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.0852241516113281, | |
| "eval_runtime": 1.787, | |
| "eval_samples_per_second": 105.766, | |
| "eval_steps_per_second": 6.715, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 13.52, | |
| "grad_norm": 0.035977743566036224, | |
| "learning_rate": 6.500000000000001e-05, | |
| "loss": 0.0296, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 13.52, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0534002780914307, | |
| "eval_runtime": 1.7706, | |
| "eval_samples_per_second": 106.746, | |
| "eval_steps_per_second": 6.778, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 13.7, | |
| "grad_norm": 0.3354228436946869, | |
| "learning_rate": 6.314814814814815e-05, | |
| "loss": 0.0142, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 13.7, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.0607830286026, | |
| "eval_runtime": 1.8039, | |
| "eval_samples_per_second": 104.775, | |
| "eval_steps_per_second": 6.652, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 13.89, | |
| "grad_norm": 0.031177503988146782, | |
| "learning_rate": 6.12962962962963e-05, | |
| "loss": 0.0199, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 13.89, | |
| "eval_accuracy": 0.746031746031746, | |
| "eval_loss": 1.0850036144256592, | |
| "eval_runtime": 1.7472, | |
| "eval_samples_per_second": 108.174, | |
| "eval_steps_per_second": 6.868, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 14.07, | |
| "grad_norm": 0.2141834944486618, | |
| "learning_rate": 5.9444444444444445e-05, | |
| "loss": 0.0169, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 14.07, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0736693143844604, | |
| "eval_runtime": 1.7821, | |
| "eval_samples_per_second": 106.054, | |
| "eval_steps_per_second": 6.734, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 14.26, | |
| "grad_norm": 0.028399189934134483, | |
| "learning_rate": 5.75925925925926e-05, | |
| "loss": 0.0139, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 14.26, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0717233419418335, | |
| "eval_runtime": 1.8135, | |
| "eval_samples_per_second": 104.221, | |
| "eval_steps_per_second": 6.617, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 14.44, | |
| "grad_norm": 0.03289506584405899, | |
| "learning_rate": 5.574074074074075e-05, | |
| "loss": 0.0173, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 14.44, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0707134008407593, | |
| "eval_runtime": 1.7856, | |
| "eval_samples_per_second": 105.849, | |
| "eval_steps_per_second": 6.721, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 14.63, | |
| "grad_norm": 0.032911308109760284, | |
| "learning_rate": 5.388888888888889e-05, | |
| "loss": 0.0101, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 14.63, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.070402979850769, | |
| "eval_runtime": 1.7933, | |
| "eval_samples_per_second": 105.391, | |
| "eval_steps_per_second": 6.691, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 14.81, | |
| "grad_norm": 0.43361806869506836, | |
| "learning_rate": 5.203703703703704e-05, | |
| "loss": 0.0286, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 14.81, | |
| "eval_accuracy": 0.7671957671957672, | |
| "eval_loss": 1.0845017433166504, | |
| "eval_runtime": 1.7994, | |
| "eval_samples_per_second": 105.033, | |
| "eval_steps_per_second": 6.669, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.05939367786049843, | |
| "learning_rate": 5.018518518518519e-05, | |
| "loss": 0.0135, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0972745418548584, | |
| "eval_runtime": 1.7785, | |
| "eval_samples_per_second": 106.271, | |
| "eval_steps_per_second": 6.747, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 15.19, | |
| "grad_norm": 0.030746394768357277, | |
| "learning_rate": 4.8333333333333334e-05, | |
| "loss": 0.0129, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 15.19, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0909744501113892, | |
| "eval_runtime": 1.7304, | |
| "eval_samples_per_second": 109.222, | |
| "eval_steps_per_second": 6.935, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 15.37, | |
| "grad_norm": 0.026390748098492622, | |
| "learning_rate": 4.648148148148148e-05, | |
| "loss": 0.0117, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 15.37, | |
| "eval_accuracy": 0.7671957671957672, | |
| "eval_loss": 1.0890551805496216, | |
| "eval_runtime": 1.8164, | |
| "eval_samples_per_second": 104.051, | |
| "eval_steps_per_second": 6.606, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 15.56, | |
| "grad_norm": 0.028341053053736687, | |
| "learning_rate": 4.462962962962963e-05, | |
| "loss": 0.014, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 15.56, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0884122848510742, | |
| "eval_runtime": 1.8336, | |
| "eval_samples_per_second": 103.079, | |
| "eval_steps_per_second": 6.545, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 15.74, | |
| "grad_norm": 0.027172435075044632, | |
| "learning_rate": 4.277777777777778e-05, | |
| "loss": 0.0093, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 15.74, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0879539251327515, | |
| "eval_runtime": 1.7368, | |
| "eval_samples_per_second": 108.818, | |
| "eval_steps_per_second": 6.909, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 15.93, | |
| "grad_norm": 0.4558853209018707, | |
| "learning_rate": 4.092592592592593e-05, | |
| "loss": 0.0264, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 15.93, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0861279964447021, | |
| "eval_runtime": 1.8295, | |
| "eval_samples_per_second": 103.306, | |
| "eval_steps_per_second": 6.559, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 16.11, | |
| "grad_norm": 0.023086287081241608, | |
| "learning_rate": 3.9074074074074076e-05, | |
| "loss": 0.0117, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 16.11, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0812128782272339, | |
| "eval_runtime": 1.783, | |
| "eval_samples_per_second": 106.0, | |
| "eval_steps_per_second": 6.73, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 16.3, | |
| "grad_norm": 0.16555258631706238, | |
| "learning_rate": 3.722222222222222e-05, | |
| "loss": 0.0131, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 16.3, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.084083080291748, | |
| "eval_runtime": 1.7979, | |
| "eval_samples_per_second": 105.125, | |
| "eval_steps_per_second": 6.675, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 16.48, | |
| "grad_norm": 0.1985342651605606, | |
| "learning_rate": 3.537037037037037e-05, | |
| "loss": 0.0107, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 16.48, | |
| "eval_accuracy": 0.7513227513227513, | |
| "eval_loss": 1.0908081531524658, | |
| "eval_runtime": 1.8371, | |
| "eval_samples_per_second": 102.877, | |
| "eval_steps_per_second": 6.532, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 16.67, | |
| "grad_norm": 0.023619532585144043, | |
| "learning_rate": 3.351851851851852e-05, | |
| "loss": 0.0253, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 16.67, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0818437337875366, | |
| "eval_runtime": 1.8128, | |
| "eval_samples_per_second": 104.258, | |
| "eval_steps_per_second": 6.62, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 16.85, | |
| "grad_norm": 0.031866107136011124, | |
| "learning_rate": 3.1666666666666666e-05, | |
| "loss": 0.0113, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 16.85, | |
| "eval_accuracy": 0.7671957671957672, | |
| "eval_loss": 1.0804176330566406, | |
| "eval_runtime": 1.7557, | |
| "eval_samples_per_second": 107.647, | |
| "eval_steps_per_second": 6.835, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 17.04, | |
| "grad_norm": 0.027054764330387115, | |
| "learning_rate": 2.981481481481482e-05, | |
| "loss": 0.0117, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 17.04, | |
| "eval_accuracy": 0.7671957671957672, | |
| "eval_loss": 1.0813896656036377, | |
| "eval_runtime": 1.8358, | |
| "eval_samples_per_second": 102.952, | |
| "eval_steps_per_second": 6.537, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 17.22, | |
| "grad_norm": 0.025050414726138115, | |
| "learning_rate": 2.7962962962962965e-05, | |
| "loss": 0.0158, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 17.22, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0813225507736206, | |
| "eval_runtime": 1.7643, | |
| "eval_samples_per_second": 107.126, | |
| "eval_steps_per_second": 6.802, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 17.41, | |
| "grad_norm": 0.024830004200339317, | |
| "learning_rate": 2.6111111111111114e-05, | |
| "loss": 0.011, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 17.41, | |
| "eval_accuracy": 0.7671957671957672, | |
| "eval_loss": 1.080676794052124, | |
| "eval_runtime": 1.759, | |
| "eval_samples_per_second": 107.45, | |
| "eval_steps_per_second": 6.822, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 17.59, | |
| "grad_norm": 0.024760620668530464, | |
| "learning_rate": 2.425925925925926e-05, | |
| "loss": 0.0137, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 17.59, | |
| "eval_accuracy": 0.7671957671957672, | |
| "eval_loss": 1.0803221464157104, | |
| "eval_runtime": 1.7971, | |
| "eval_samples_per_second": 105.168, | |
| "eval_steps_per_second": 6.677, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 17.78, | |
| "grad_norm": 0.025229470804333687, | |
| "learning_rate": 2.240740740740741e-05, | |
| "loss": 0.0112, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 17.78, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0807117223739624, | |
| "eval_runtime": 1.7675, | |
| "eval_samples_per_second": 106.93, | |
| "eval_steps_per_second": 6.789, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 17.96, | |
| "grad_norm": 0.02313585951924324, | |
| "learning_rate": 2.0555555555555555e-05, | |
| "loss": 0.0172, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 17.96, | |
| "eval_accuracy": 0.7566137566137566, | |
| "eval_loss": 1.0821946859359741, | |
| "eval_runtime": 1.8179, | |
| "eval_samples_per_second": 103.964, | |
| "eval_steps_per_second": 6.601, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 18.15, | |
| "grad_norm": 0.024956317618489265, | |
| "learning_rate": 1.8703703703703704e-05, | |
| "loss": 0.0132, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 18.15, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0860090255737305, | |
| "eval_runtime": 1.7729, | |
| "eval_samples_per_second": 106.607, | |
| "eval_steps_per_second": 6.769, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 18.33, | |
| "grad_norm": 0.02182234823703766, | |
| "learning_rate": 1.6851851851851853e-05, | |
| "loss": 0.0127, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 18.33, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0875723361968994, | |
| "eval_runtime": 1.7863, | |
| "eval_samples_per_second": 105.804, | |
| "eval_steps_per_second": 6.718, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 18.52, | |
| "grad_norm": 0.024420464411377907, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.0152, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 18.52, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0873754024505615, | |
| "eval_runtime": 1.7723, | |
| "eval_samples_per_second": 106.644, | |
| "eval_steps_per_second": 6.771, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 18.7, | |
| "grad_norm": 0.18311668932437897, | |
| "learning_rate": 1.3148148148148148e-05, | |
| "loss": 0.0096, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 18.7, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.088024377822876, | |
| "eval_runtime": 1.8979, | |
| "eval_samples_per_second": 99.583, | |
| "eval_steps_per_second": 6.323, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 18.89, | |
| "grad_norm": 0.023139068856835365, | |
| "learning_rate": 1.1296296296296297e-05, | |
| "loss": 0.0107, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 18.89, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.08987557888031, | |
| "eval_runtime": 1.8132, | |
| "eval_samples_per_second": 104.237, | |
| "eval_steps_per_second": 6.618, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 19.07, | |
| "grad_norm": 0.024323537945747375, | |
| "learning_rate": 9.444444444444445e-06, | |
| "loss": 0.0124, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 19.07, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0899451971054077, | |
| "eval_runtime": 1.7841, | |
| "eval_samples_per_second": 105.934, | |
| "eval_steps_per_second": 6.726, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 19.26, | |
| "grad_norm": 0.20473988354206085, | |
| "learning_rate": 7.592592592592593e-06, | |
| "loss": 0.0187, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 19.26, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0915277004241943, | |
| "eval_runtime": 1.7828, | |
| "eval_samples_per_second": 106.015, | |
| "eval_steps_per_second": 6.731, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 19.44, | |
| "grad_norm": 0.021954894065856934, | |
| "learning_rate": 5.740740740740741e-06, | |
| "loss": 0.0159, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 19.44, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0916674137115479, | |
| "eval_runtime": 1.7554, | |
| "eval_samples_per_second": 107.665, | |
| "eval_steps_per_second": 6.836, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 19.63, | |
| "grad_norm": 0.02447775937616825, | |
| "learning_rate": 3.888888888888889e-06, | |
| "loss": 0.0107, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 19.63, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.091030240058899, | |
| "eval_runtime": 1.7566, | |
| "eval_samples_per_second": 107.597, | |
| "eval_steps_per_second": 6.832, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 19.81, | |
| "grad_norm": 0.02190612629055977, | |
| "learning_rate": 2.0370370370370375e-06, | |
| "loss": 0.0105, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 19.81, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0911825895309448, | |
| "eval_runtime": 1.7879, | |
| "eval_samples_per_second": 105.71, | |
| "eval_steps_per_second": 6.712, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.0392175130546093, | |
| "learning_rate": 1.851851851851852e-07, | |
| "loss": 0.0076, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_accuracy": 0.7619047619047619, | |
| "eval_loss": 1.0909953117370605, | |
| "eval_runtime": 1.7898, | |
| "eval_samples_per_second": 105.596, | |
| "eval_steps_per_second": 6.704, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "step": 1080, | |
| "total_flos": 2.681093741830963e+18, | |
| "train_loss": 0.29336747460895113, | |
| "train_runtime": 795.2059, | |
| "train_samples_per_second": 42.681, | |
| "train_steps_per_second": 1.358 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1080, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 10, | |
| "total_flos": 2.681093741830963e+18, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |