{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.993342210386152, "eval_steps": 500, "global_step": 1125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002663115845539281, "grad_norm": 59.669442519158444, "learning_rate": 4.424778761061947e-07, "loss": 11.0815, "step": 1 }, { "epoch": 0.005326231691078562, "grad_norm": 59.77300379138749, "learning_rate": 8.849557522123894e-07, "loss": 11.0703, "step": 2 }, { "epoch": 0.007989347536617843, "grad_norm": 59.37811338851668, "learning_rate": 1.3274336283185841e-06, "loss": 11.1149, "step": 3 }, { "epoch": 0.010652463382157125, "grad_norm": 59.714257927262075, "learning_rate": 1.7699115044247788e-06, "loss": 11.1, "step": 4 }, { "epoch": 0.013315579227696404, "grad_norm": 62.19325541849273, "learning_rate": 2.2123893805309734e-06, "loss": 10.9008, "step": 5 }, { "epoch": 0.015978695073235686, "grad_norm": 64.3469313247898, "learning_rate": 2.6548672566371683e-06, "loss": 10.7897, "step": 6 }, { "epoch": 0.018641810918774968, "grad_norm": 64.70693307946331, "learning_rate": 3.097345132743363e-06, "loss": 10.6244, "step": 7 }, { "epoch": 0.02130492676431425, "grad_norm": 100.07904925734698, "learning_rate": 3.5398230088495575e-06, "loss": 9.3505, "step": 8 }, { "epoch": 0.023968042609853527, "grad_norm": 121.42213770896274, "learning_rate": 3.982300884955752e-06, "loss": 8.5961, "step": 9 }, { "epoch": 0.02663115845539281, "grad_norm": 64.96997432704501, "learning_rate": 4.424778761061947e-06, "loss": 3.5386, "step": 10 }, { "epoch": 0.02929427430093209, "grad_norm": 53.5067123571589, "learning_rate": 4.867256637168142e-06, "loss": 3.1169, "step": 11 }, { "epoch": 0.03195739014647137, "grad_norm": 34.28454533456946, "learning_rate": 5.3097345132743365e-06, "loss": 2.3171, "step": 12 }, { "epoch": 0.03462050599201065, "grad_norm": 28.02284592011359, "learning_rate": 5.752212389380531e-06, "loss": 2.1704, "step": 13 }, { "epoch": 0.037283621837549935, "grad_norm": 6.230233716943746, "learning_rate": 6.194690265486726e-06, "loss": 1.3702, "step": 14 }, { "epoch": 0.03994673768308921, "grad_norm": 4.8265444090252325, "learning_rate": 6.6371681415929215e-06, "loss": 1.2994, "step": 15 }, { "epoch": 0.0426098535286285, "grad_norm": 3.4989649353882544, "learning_rate": 7.079646017699115e-06, "loss": 1.1939, "step": 16 }, { "epoch": 0.045272969374167776, "grad_norm": 2.548022240081304, "learning_rate": 7.52212389380531e-06, "loss": 1.1113, "step": 17 }, { "epoch": 0.047936085219707054, "grad_norm": 1.7785073197319812, "learning_rate": 7.964601769911505e-06, "loss": 1.0099, "step": 18 }, { "epoch": 0.05059920106524634, "grad_norm": 52.43472197468591, "learning_rate": 8.407079646017701e-06, "loss": 1.0002, "step": 19 }, { "epoch": 0.05326231691078562, "grad_norm": 18.71256882921437, "learning_rate": 8.849557522123894e-06, "loss": 0.9335, "step": 20 }, { "epoch": 0.0559254327563249, "grad_norm": 1.6748381666125123, "learning_rate": 9.29203539823009e-06, "loss": 0.8897, "step": 21 }, { "epoch": 0.05858854860186418, "grad_norm": 1.2119772296620004, "learning_rate": 9.734513274336284e-06, "loss": 0.8728, "step": 22 }, { "epoch": 0.06125166444740346, "grad_norm": 0.9292233025769583, "learning_rate": 1.0176991150442479e-05, "loss": 0.8443, "step": 23 }, { "epoch": 0.06391478029294274, "grad_norm": 0.8058222924733704, "learning_rate": 1.0619469026548673e-05, "loss": 0.8065, "step": 24 }, { "epoch": 0.06657789613848203, "grad_norm": 0.7676888976773729, "learning_rate": 1.1061946902654869e-05, "loss": 0.744, "step": 25 }, { "epoch": 0.0692410119840213, "grad_norm": 1.1442962246712427, "learning_rate": 1.1504424778761062e-05, "loss": 0.7962, "step": 26 }, { "epoch": 0.07190412782956059, "grad_norm": 0.8086732801653846, "learning_rate": 1.1946902654867258e-05, "loss": 0.7546, "step": 27 }, { "epoch": 0.07456724367509987, "grad_norm": 0.6032687314644429, "learning_rate": 1.2389380530973452e-05, "loss": 0.6961, "step": 28 }, { "epoch": 0.07723035952063914, "grad_norm": 0.8050008569135423, "learning_rate": 1.2831858407079647e-05, "loss": 0.7181, "step": 29 }, { "epoch": 0.07989347536617843, "grad_norm": 0.7760170053857292, "learning_rate": 1.3274336283185843e-05, "loss": 0.7011, "step": 30 }, { "epoch": 0.08255659121171771, "grad_norm": 0.6911853454916363, "learning_rate": 1.3716814159292036e-05, "loss": 0.6767, "step": 31 }, { "epoch": 0.085219707057257, "grad_norm": 0.5690990372888421, "learning_rate": 1.415929203539823e-05, "loss": 0.6657, "step": 32 }, { "epoch": 0.08788282290279627, "grad_norm": 0.46539236587043925, "learning_rate": 1.4601769911504426e-05, "loss": 0.6585, "step": 33 }, { "epoch": 0.09054593874833555, "grad_norm": 0.6011651474231043, "learning_rate": 1.504424778761062e-05, "loss": 0.6571, "step": 34 }, { "epoch": 0.09320905459387484, "grad_norm": 0.6055438783984222, "learning_rate": 1.5486725663716813e-05, "loss": 0.6307, "step": 35 }, { "epoch": 0.09587217043941411, "grad_norm": 0.4930140407791457, "learning_rate": 1.592920353982301e-05, "loss": 0.638, "step": 36 }, { "epoch": 0.0985352862849534, "grad_norm": 0.38727032176053555, "learning_rate": 1.6371681415929206e-05, "loss": 0.6189, "step": 37 }, { "epoch": 0.10119840213049268, "grad_norm": 0.46992360907642716, "learning_rate": 1.6814159292035402e-05, "loss": 0.6242, "step": 38 }, { "epoch": 0.10386151797603196, "grad_norm": 0.5002104790615647, "learning_rate": 1.7256637168141594e-05, "loss": 0.6087, "step": 39 }, { "epoch": 0.10652463382157124, "grad_norm": 0.4378982855259104, "learning_rate": 1.7699115044247787e-05, "loss": 0.6112, "step": 40 }, { "epoch": 0.10918774966711052, "grad_norm": 0.343549106950523, "learning_rate": 1.8141592920353983e-05, "loss": 0.6251, "step": 41 }, { "epoch": 0.1118508655126498, "grad_norm": 0.43140422077824325, "learning_rate": 1.858407079646018e-05, "loss": 0.625, "step": 42 }, { "epoch": 0.11451398135818908, "grad_norm": 0.44945895418028914, "learning_rate": 1.9026548672566372e-05, "loss": 0.576, "step": 43 }, { "epoch": 0.11717709720372836, "grad_norm": 0.33640715838659224, "learning_rate": 1.946902654867257e-05, "loss": 0.602, "step": 44 }, { "epoch": 0.11984021304926765, "grad_norm": 0.3602083165810118, "learning_rate": 1.991150442477876e-05, "loss": 0.5707, "step": 45 }, { "epoch": 0.12250332889480692, "grad_norm": 1.7341245223857158, "learning_rate": 2.0353982300884957e-05, "loss": 0.5662, "step": 46 }, { "epoch": 0.12516644474034622, "grad_norm": 0.42320706053839496, "learning_rate": 2.079646017699115e-05, "loss": 0.5718, "step": 47 }, { "epoch": 0.1278295605858855, "grad_norm": 0.34356067841011745, "learning_rate": 2.1238938053097346e-05, "loss": 0.5652, "step": 48 }, { "epoch": 0.13049267643142476, "grad_norm": 0.37607875054105366, "learning_rate": 2.1681415929203542e-05, "loss": 0.6079, "step": 49 }, { "epoch": 0.13315579227696406, "grad_norm": 0.355877489349339, "learning_rate": 2.2123893805309738e-05, "loss": 0.5414, "step": 50 }, { "epoch": 0.13581890812250333, "grad_norm": 0.3531413648567738, "learning_rate": 2.2566371681415928e-05, "loss": 0.5383, "step": 51 }, { "epoch": 0.1384820239680426, "grad_norm": 0.3900867327584249, "learning_rate": 2.3008849557522124e-05, "loss": 0.5607, "step": 52 }, { "epoch": 0.1411451398135819, "grad_norm": 0.29096561379999103, "learning_rate": 2.345132743362832e-05, "loss": 0.5428, "step": 53 }, { "epoch": 0.14380825565912117, "grad_norm": 0.34882597172967983, "learning_rate": 2.3893805309734516e-05, "loss": 0.5597, "step": 54 }, { "epoch": 0.14647137150466044, "grad_norm": 0.31745047102841745, "learning_rate": 2.433628318584071e-05, "loss": 0.5427, "step": 55 }, { "epoch": 0.14913448735019974, "grad_norm": 0.3429464925874952, "learning_rate": 2.4778761061946905e-05, "loss": 0.5418, "step": 56 }, { "epoch": 0.151797603195739, "grad_norm": 0.28154789184935636, "learning_rate": 2.5221238938053098e-05, "loss": 0.5701, "step": 57 }, { "epoch": 0.15446071904127828, "grad_norm": 0.3141148216942468, "learning_rate": 2.5663716814159294e-05, "loss": 0.5279, "step": 58 }, { "epoch": 0.15712383488681758, "grad_norm": 0.3077683025338142, "learning_rate": 2.610619469026549e-05, "loss": 0.5443, "step": 59 }, { "epoch": 0.15978695073235685, "grad_norm": 0.35329472069062134, "learning_rate": 2.6548672566371686e-05, "loss": 0.5657, "step": 60 }, { "epoch": 0.16245006657789615, "grad_norm": 0.30082869981695665, "learning_rate": 2.6991150442477875e-05, "loss": 0.5386, "step": 61 }, { "epoch": 0.16511318242343542, "grad_norm": 0.3705381333041911, "learning_rate": 2.743362831858407e-05, "loss": 0.5417, "step": 62 }, { "epoch": 0.1677762982689747, "grad_norm": 0.3424625742113855, "learning_rate": 2.7876106194690264e-05, "loss": 0.5334, "step": 63 }, { "epoch": 0.170439414114514, "grad_norm": 0.2904098798351202, "learning_rate": 2.831858407079646e-05, "loss": 0.5424, "step": 64 }, { "epoch": 0.17310252996005326, "grad_norm": 0.32851572085926894, "learning_rate": 2.8761061946902656e-05, "loss": 0.5231, "step": 65 }, { "epoch": 0.17576564580559254, "grad_norm": 0.29034784648982725, "learning_rate": 2.9203539823008852e-05, "loss": 0.5394, "step": 66 }, { "epoch": 0.17842876165113183, "grad_norm": 0.33213549417249844, "learning_rate": 2.964601769911505e-05, "loss": 0.54, "step": 67 }, { "epoch": 0.1810918774966711, "grad_norm": 0.2751631826164567, "learning_rate": 3.008849557522124e-05, "loss": 0.5254, "step": 68 }, { "epoch": 0.18375499334221038, "grad_norm": 0.3037009657021324, "learning_rate": 3.0530973451327434e-05, "loss": 0.5216, "step": 69 }, { "epoch": 0.18641810918774968, "grad_norm": 0.30105360826964594, "learning_rate": 3.097345132743363e-05, "loss": 0.5111, "step": 70 }, { "epoch": 0.18908122503328895, "grad_norm": 0.3202863693523833, "learning_rate": 3.1415929203539826e-05, "loss": 0.537, "step": 71 }, { "epoch": 0.19174434087882822, "grad_norm": 0.3294366280935238, "learning_rate": 3.185840707964602e-05, "loss": 0.5215, "step": 72 }, { "epoch": 0.19440745672436752, "grad_norm": 0.32228297514585236, "learning_rate": 3.230088495575221e-05, "loss": 0.536, "step": 73 }, { "epoch": 0.1970705725699068, "grad_norm": 0.31224977631197853, "learning_rate": 3.274336283185841e-05, "loss": 0.5133, "step": 74 }, { "epoch": 0.19973368841544606, "grad_norm": 0.34249789697496347, "learning_rate": 3.3185840707964604e-05, "loss": 0.5187, "step": 75 }, { "epoch": 0.20239680426098536, "grad_norm": 0.3014674455677291, "learning_rate": 3.3628318584070804e-05, "loss": 0.5173, "step": 76 }, { "epoch": 0.20505992010652463, "grad_norm": 0.31181209074311145, "learning_rate": 3.407079646017699e-05, "loss": 0.4938, "step": 77 }, { "epoch": 0.20772303595206393, "grad_norm": 0.3421599429123891, "learning_rate": 3.451327433628319e-05, "loss": 0.5178, "step": 78 }, { "epoch": 0.2103861517976032, "grad_norm": 0.32144698779599035, "learning_rate": 3.495575221238938e-05, "loss": 0.529, "step": 79 }, { "epoch": 0.21304926764314247, "grad_norm": 0.30829102288383803, "learning_rate": 3.5398230088495574e-05, "loss": 0.5045, "step": 80 }, { "epoch": 0.21571238348868177, "grad_norm": 0.3320673147021741, "learning_rate": 3.5840707964601774e-05, "loss": 0.5193, "step": 81 }, { "epoch": 0.21837549933422104, "grad_norm": 0.3257493459194373, "learning_rate": 3.628318584070797e-05, "loss": 0.5161, "step": 82 }, { "epoch": 0.2210386151797603, "grad_norm": 0.3451069209364067, "learning_rate": 3.672566371681416e-05, "loss": 0.4902, "step": 83 }, { "epoch": 0.2237017310252996, "grad_norm": 0.38062902785170477, "learning_rate": 3.716814159292036e-05, "loss": 0.5106, "step": 84 }, { "epoch": 0.22636484687083888, "grad_norm": 0.3437845837066077, "learning_rate": 3.7610619469026545e-05, "loss": 0.5072, "step": 85 }, { "epoch": 0.22902796271637815, "grad_norm": 0.4369801740657791, "learning_rate": 3.8053097345132744e-05, "loss": 0.5016, "step": 86 }, { "epoch": 0.23169107856191745, "grad_norm": 0.39323367167161793, "learning_rate": 3.849557522123894e-05, "loss": 0.5126, "step": 87 }, { "epoch": 0.23435419440745672, "grad_norm": 0.3804923058106557, "learning_rate": 3.893805309734514e-05, "loss": 0.5169, "step": 88 }, { "epoch": 0.237017310252996, "grad_norm": 0.3991475997522414, "learning_rate": 3.938053097345133e-05, "loss": 0.5206, "step": 89 }, { "epoch": 0.2396804260985353, "grad_norm": 0.3345983998430803, "learning_rate": 3.982300884955752e-05, "loss": 0.5126, "step": 90 }, { "epoch": 0.24234354194407456, "grad_norm": 0.37605023011424904, "learning_rate": 4.026548672566372e-05, "loss": 0.517, "step": 91 }, { "epoch": 0.24500665778961384, "grad_norm": 0.30015095297467786, "learning_rate": 4.0707964601769914e-05, "loss": 0.5146, "step": 92 }, { "epoch": 0.24766977363515313, "grad_norm": 0.37615535541775885, "learning_rate": 4.115044247787611e-05, "loss": 0.4897, "step": 93 }, { "epoch": 0.25033288948069243, "grad_norm": 0.32506469165922075, "learning_rate": 4.15929203539823e-05, "loss": 0.5033, "step": 94 }, { "epoch": 0.2529960053262317, "grad_norm": 0.3955130401533768, "learning_rate": 4.20353982300885e-05, "loss": 0.517, "step": 95 }, { "epoch": 0.255659121171771, "grad_norm": 0.38256193351931217, "learning_rate": 4.247787610619469e-05, "loss": 0.4903, "step": 96 }, { "epoch": 0.2583222370173103, "grad_norm": 0.3757931359073768, "learning_rate": 4.2920353982300885e-05, "loss": 0.4881, "step": 97 }, { "epoch": 0.2609853528628495, "grad_norm": 0.4073525724085135, "learning_rate": 4.3362831858407084e-05, "loss": 0.4981, "step": 98 }, { "epoch": 0.2636484687083888, "grad_norm": 0.42226304140119747, "learning_rate": 4.380530973451328e-05, "loss": 0.4777, "step": 99 }, { "epoch": 0.2663115845539281, "grad_norm": 0.47546631243940135, "learning_rate": 4.4247787610619477e-05, "loss": 0.5012, "step": 100 }, { "epoch": 0.26897470039946736, "grad_norm": 0.38067024978966585, "learning_rate": 4.469026548672566e-05, "loss": 0.5038, "step": 101 }, { "epoch": 0.27163781624500666, "grad_norm": 0.3549335612107799, "learning_rate": 4.5132743362831855e-05, "loss": 0.5046, "step": 102 }, { "epoch": 0.27430093209054596, "grad_norm": 0.4081532806299182, "learning_rate": 4.5575221238938055e-05, "loss": 0.4816, "step": 103 }, { "epoch": 0.2769640479360852, "grad_norm": 0.35702973975911423, "learning_rate": 4.601769911504425e-05, "loss": 0.4969, "step": 104 }, { "epoch": 0.2796271637816245, "grad_norm": 0.3750952303695297, "learning_rate": 4.646017699115045e-05, "loss": 0.5129, "step": 105 }, { "epoch": 0.2822902796271638, "grad_norm": 0.3713537523929101, "learning_rate": 4.690265486725664e-05, "loss": 0.4871, "step": 106 }, { "epoch": 0.28495339547270304, "grad_norm": 0.47534354342607993, "learning_rate": 4.734513274336283e-05, "loss": 0.4971, "step": 107 }, { "epoch": 0.28761651131824234, "grad_norm": 0.41826478296211245, "learning_rate": 4.778761061946903e-05, "loss": 0.4943, "step": 108 }, { "epoch": 0.29027962716378164, "grad_norm": 0.39759514237849775, "learning_rate": 4.823008849557522e-05, "loss": 0.5014, "step": 109 }, { "epoch": 0.2929427430093209, "grad_norm": 0.4548008624547614, "learning_rate": 4.867256637168142e-05, "loss": 0.5067, "step": 110 }, { "epoch": 0.2956058588548602, "grad_norm": 0.4618812739465874, "learning_rate": 4.911504424778761e-05, "loss": 0.487, "step": 111 }, { "epoch": 0.2982689747003995, "grad_norm": 0.31165613667101594, "learning_rate": 4.955752212389381e-05, "loss": 0.4908, "step": 112 }, { "epoch": 0.3009320905459387, "grad_norm": 0.45735168765249185, "learning_rate": 5e-05, "loss": 0.4924, "step": 113 }, { "epoch": 0.303595206391478, "grad_norm": 0.4659242945372524, "learning_rate": 4.9950592885375493e-05, "loss": 0.49, "step": 114 }, { "epoch": 0.3062583222370173, "grad_norm": 0.3422222311667708, "learning_rate": 4.990118577075099e-05, "loss": 0.4902, "step": 115 }, { "epoch": 0.30892143808255657, "grad_norm": 0.5702864889691999, "learning_rate": 4.985177865612648e-05, "loss": 0.4712, "step": 116 }, { "epoch": 0.31158455392809586, "grad_norm": 0.31000398399919754, "learning_rate": 4.980237154150198e-05, "loss": 0.4729, "step": 117 }, { "epoch": 0.31424766977363516, "grad_norm": 0.5329093367544124, "learning_rate": 4.975296442687747e-05, "loss": 0.4979, "step": 118 }, { "epoch": 0.3169107856191744, "grad_norm": 0.41581595613618844, "learning_rate": 4.970355731225297e-05, "loss": 0.4979, "step": 119 }, { "epoch": 0.3195739014647137, "grad_norm": 0.5898871183617019, "learning_rate": 4.965415019762846e-05, "loss": 0.4841, "step": 120 }, { "epoch": 0.322237017310253, "grad_norm": 0.5277745967026336, "learning_rate": 4.960474308300396e-05, "loss": 0.494, "step": 121 }, { "epoch": 0.3249001331557923, "grad_norm": 0.6707049603761084, "learning_rate": 4.955533596837945e-05, "loss": 0.4816, "step": 122 }, { "epoch": 0.32756324900133155, "grad_norm": 0.39379278723705347, "learning_rate": 4.950592885375494e-05, "loss": 0.4708, "step": 123 }, { "epoch": 0.33022636484687085, "grad_norm": 0.5682660745624962, "learning_rate": 4.945652173913044e-05, "loss": 0.4844, "step": 124 }, { "epoch": 0.33288948069241014, "grad_norm": 0.4164160620027728, "learning_rate": 4.940711462450593e-05, "loss": 0.4577, "step": 125 }, { "epoch": 0.3355525965379494, "grad_norm": 0.5359420179155978, "learning_rate": 4.9357707509881426e-05, "loss": 0.4723, "step": 126 }, { "epoch": 0.3382157123834887, "grad_norm": 0.5026386563312899, "learning_rate": 4.930830039525692e-05, "loss": 0.4706, "step": 127 }, { "epoch": 0.340878828229028, "grad_norm": 0.5189502106027113, "learning_rate": 4.9258893280632415e-05, "loss": 0.4814, "step": 128 }, { "epoch": 0.34354194407456723, "grad_norm": 0.46462849504368775, "learning_rate": 4.9209486166007906e-05, "loss": 0.4735, "step": 129 }, { "epoch": 0.34620505992010653, "grad_norm": 0.5495458064144569, "learning_rate": 4.9160079051383404e-05, "loss": 0.4964, "step": 130 }, { "epoch": 0.3488681757656458, "grad_norm": 0.4136354389486864, "learning_rate": 4.9110671936758895e-05, "loss": 0.4937, "step": 131 }, { "epoch": 0.35153129161118507, "grad_norm": 0.49819742888588847, "learning_rate": 4.906126482213439e-05, "loss": 0.4929, "step": 132 }, { "epoch": 0.35419440745672437, "grad_norm": 0.5211986557669676, "learning_rate": 4.901185770750988e-05, "loss": 0.4722, "step": 133 }, { "epoch": 0.35685752330226367, "grad_norm": 0.3743611868649684, "learning_rate": 4.896245059288538e-05, "loss": 0.4852, "step": 134 }, { "epoch": 0.3595206391478029, "grad_norm": 0.47244102498767254, "learning_rate": 4.891304347826087e-05, "loss": 0.4846, "step": 135 }, { "epoch": 0.3621837549933422, "grad_norm": 0.39536123377896054, "learning_rate": 4.886363636363637e-05, "loss": 0.4812, "step": 136 }, { "epoch": 0.3648468708388815, "grad_norm": 0.39389579963168014, "learning_rate": 4.881422924901186e-05, "loss": 0.4814, "step": 137 }, { "epoch": 0.36750998668442075, "grad_norm": 0.5517767967854046, "learning_rate": 4.876482213438736e-05, "loss": 0.4605, "step": 138 }, { "epoch": 0.37017310252996005, "grad_norm": 0.3371092349408584, "learning_rate": 4.871541501976285e-05, "loss": 0.4919, "step": 139 }, { "epoch": 0.37283621837549935, "grad_norm": 0.5454997328166629, "learning_rate": 4.866600790513835e-05, "loss": 0.478, "step": 140 }, { "epoch": 0.3754993342210386, "grad_norm": 0.38191662974594565, "learning_rate": 4.861660079051384e-05, "loss": 0.4675, "step": 141 }, { "epoch": 0.3781624500665779, "grad_norm": 0.44622867680541506, "learning_rate": 4.8567193675889336e-05, "loss": 0.4767, "step": 142 }, { "epoch": 0.3808255659121172, "grad_norm": 0.40615171610446554, "learning_rate": 4.851778656126482e-05, "loss": 0.4796, "step": 143 }, { "epoch": 0.38348868175765644, "grad_norm": 0.4067512139515564, "learning_rate": 4.846837944664032e-05, "loss": 0.4921, "step": 144 }, { "epoch": 0.38615179760319573, "grad_norm": 0.3764557796844728, "learning_rate": 4.841897233201581e-05, "loss": 0.4859, "step": 145 }, { "epoch": 0.38881491344873503, "grad_norm": 0.4154794205261891, "learning_rate": 4.836956521739131e-05, "loss": 0.4673, "step": 146 }, { "epoch": 0.3914780292942743, "grad_norm": 0.4269745611686079, "learning_rate": 4.83201581027668e-05, "loss": 0.4551, "step": 147 }, { "epoch": 0.3941411451398136, "grad_norm": 0.38377387438781274, "learning_rate": 4.8270750988142296e-05, "loss": 0.487, "step": 148 }, { "epoch": 0.3968042609853529, "grad_norm": 0.5603533831020405, "learning_rate": 4.822134387351779e-05, "loss": 0.4849, "step": 149 }, { "epoch": 0.3994673768308921, "grad_norm": 0.3973953941114295, "learning_rate": 4.8171936758893284e-05, "loss": 0.4776, "step": 150 }, { "epoch": 0.4021304926764314, "grad_norm": 0.4956339650363368, "learning_rate": 4.8122529644268775e-05, "loss": 0.4588, "step": 151 }, { "epoch": 0.4047936085219707, "grad_norm": 0.38460346615021695, "learning_rate": 4.807312252964427e-05, "loss": 0.4737, "step": 152 }, { "epoch": 0.40745672436750996, "grad_norm": 0.5226991882164052, "learning_rate": 4.8023715415019764e-05, "loss": 0.4827, "step": 153 }, { "epoch": 0.41011984021304926, "grad_norm": 0.3418933085513387, "learning_rate": 4.797430830039526e-05, "loss": 0.4594, "step": 154 }, { "epoch": 0.41278295605858856, "grad_norm": 0.41779277140490917, "learning_rate": 4.792490118577075e-05, "loss": 0.4738, "step": 155 }, { "epoch": 0.41544607190412786, "grad_norm": 0.40524225841023903, "learning_rate": 4.787549407114625e-05, "loss": 0.4725, "step": 156 }, { "epoch": 0.4181091877496671, "grad_norm": 0.37804713363928255, "learning_rate": 4.782608695652174e-05, "loss": 0.476, "step": 157 }, { "epoch": 0.4207723035952064, "grad_norm": 0.32987544007452513, "learning_rate": 4.777667984189724e-05, "loss": 0.4606, "step": 158 }, { "epoch": 0.4234354194407457, "grad_norm": 0.32638522089295396, "learning_rate": 4.772727272727273e-05, "loss": 0.4796, "step": 159 }, { "epoch": 0.42609853528628494, "grad_norm": 0.3653611962183669, "learning_rate": 4.767786561264823e-05, "loss": 0.4703, "step": 160 }, { "epoch": 0.42876165113182424, "grad_norm": 0.39387144328442575, "learning_rate": 4.762845849802372e-05, "loss": 0.4821, "step": 161 }, { "epoch": 0.43142476697736354, "grad_norm": 0.473795283228247, "learning_rate": 4.757905138339921e-05, "loss": 0.4638, "step": 162 }, { "epoch": 0.4340878828229028, "grad_norm": 0.33040966306125785, "learning_rate": 4.75296442687747e-05, "loss": 0.4734, "step": 163 }, { "epoch": 0.4367509986684421, "grad_norm": 0.42723446550700767, "learning_rate": 4.74802371541502e-05, "loss": 0.4809, "step": 164 }, { "epoch": 0.4394141145139814, "grad_norm": 0.3675475725903659, "learning_rate": 4.743083003952569e-05, "loss": 0.4586, "step": 165 }, { "epoch": 0.4420772303595206, "grad_norm": 0.4219979464151687, "learning_rate": 4.738142292490119e-05, "loss": 0.4678, "step": 166 }, { "epoch": 0.4447403462050599, "grad_norm": 0.3857740050906692, "learning_rate": 4.733201581027668e-05, "loss": 0.4633, "step": 167 }, { "epoch": 0.4474034620505992, "grad_norm": 0.365686963876862, "learning_rate": 4.7282608695652177e-05, "loss": 0.4712, "step": 168 }, { "epoch": 0.45006657789613846, "grad_norm": 0.43242439287350204, "learning_rate": 4.723320158102767e-05, "loss": 0.4751, "step": 169 }, { "epoch": 0.45272969374167776, "grad_norm": 0.3908982963736634, "learning_rate": 4.7183794466403165e-05, "loss": 0.4723, "step": 170 }, { "epoch": 0.45539280958721706, "grad_norm": 0.4693769425526856, "learning_rate": 4.7134387351778656e-05, "loss": 0.4511, "step": 171 }, { "epoch": 0.4580559254327563, "grad_norm": 0.3437754359793867, "learning_rate": 4.7084980237154154e-05, "loss": 0.4634, "step": 172 }, { "epoch": 0.4607190412782956, "grad_norm": 0.5270401669346302, "learning_rate": 4.7035573122529645e-05, "loss": 0.4621, "step": 173 }, { "epoch": 0.4633821571238349, "grad_norm": 0.4696714456346351, "learning_rate": 4.698616600790514e-05, "loss": 0.4544, "step": 174 }, { "epoch": 0.46604527296937415, "grad_norm": 0.5068508932227126, "learning_rate": 4.6936758893280634e-05, "loss": 0.4506, "step": 175 }, { "epoch": 0.46870838881491345, "grad_norm": 0.503240500645686, "learning_rate": 4.688735177865613e-05, "loss": 0.4653, "step": 176 }, { "epoch": 0.47137150466045274, "grad_norm": 0.4373004531246149, "learning_rate": 4.683794466403162e-05, "loss": 0.4711, "step": 177 }, { "epoch": 0.474034620505992, "grad_norm": 0.3777218592654747, "learning_rate": 4.678853754940712e-05, "loss": 0.466, "step": 178 }, { "epoch": 0.4766977363515313, "grad_norm": 0.5064461910000716, "learning_rate": 4.673913043478261e-05, "loss": 0.4516, "step": 179 }, { "epoch": 0.4793608521970706, "grad_norm": 0.37515242222191797, "learning_rate": 4.668972332015811e-05, "loss": 0.4708, "step": 180 }, { "epoch": 0.48202396804260983, "grad_norm": 0.44905049367290634, "learning_rate": 4.66403162055336e-05, "loss": 0.4462, "step": 181 }, { "epoch": 0.48468708388814913, "grad_norm": 0.37911463481430624, "learning_rate": 4.659090909090909e-05, "loss": 0.4451, "step": 182 }, { "epoch": 0.4873501997336884, "grad_norm": 0.3830462171805543, "learning_rate": 4.654150197628458e-05, "loss": 0.4682, "step": 183 }, { "epoch": 0.49001331557922767, "grad_norm": 0.41200778908045926, "learning_rate": 4.649209486166008e-05, "loss": 0.4497, "step": 184 }, { "epoch": 0.49267643142476697, "grad_norm": 0.4315187398326425, "learning_rate": 4.644268774703557e-05, "loss": 0.4752, "step": 185 }, { "epoch": 0.49533954727030627, "grad_norm": 0.4519541174810682, "learning_rate": 4.639328063241107e-05, "loss": 0.4764, "step": 186 }, { "epoch": 0.4980026631158455, "grad_norm": 0.4089102997614078, "learning_rate": 4.634387351778656e-05, "loss": 0.4663, "step": 187 }, { "epoch": 0.5006657789613849, "grad_norm": 0.352791614063271, "learning_rate": 4.629446640316206e-05, "loss": 0.4671, "step": 188 }, { "epoch": 0.5033288948069241, "grad_norm": 0.3866144187741864, "learning_rate": 4.624505928853755e-05, "loss": 0.4746, "step": 189 }, { "epoch": 0.5059920106524634, "grad_norm": 0.4028526989391047, "learning_rate": 4.6195652173913046e-05, "loss": 0.4811, "step": 190 }, { "epoch": 0.5086551264980027, "grad_norm": 0.4580432915919317, "learning_rate": 4.614624505928854e-05, "loss": 0.4678, "step": 191 }, { "epoch": 0.511318242343542, "grad_norm": 0.47798645545842755, "learning_rate": 4.6096837944664035e-05, "loss": 0.4514, "step": 192 }, { "epoch": 0.5139813581890812, "grad_norm": 0.40636636658954495, "learning_rate": 4.6047430830039526e-05, "loss": 0.4356, "step": 193 }, { "epoch": 0.5166444740346205, "grad_norm": 0.4206946394322433, "learning_rate": 4.5998023715415024e-05, "loss": 0.4637, "step": 194 }, { "epoch": 0.5193075898801598, "grad_norm": 0.4977083130622833, "learning_rate": 4.5948616600790515e-05, "loss": 0.4525, "step": 195 }, { "epoch": 0.521970705725699, "grad_norm": 0.3826090231131446, "learning_rate": 4.589920948616601e-05, "loss": 0.4647, "step": 196 }, { "epoch": 0.5246338215712384, "grad_norm": 0.443905698975846, "learning_rate": 4.5849802371541504e-05, "loss": 0.466, "step": 197 }, { "epoch": 0.5272969374167776, "grad_norm": 0.34058976392880835, "learning_rate": 4.5800395256917e-05, "loss": 0.4462, "step": 198 }, { "epoch": 0.5299600532623169, "grad_norm": 0.3708303032984336, "learning_rate": 4.575098814229249e-05, "loss": 0.4638, "step": 199 }, { "epoch": 0.5326231691078562, "grad_norm": 0.4046635861089521, "learning_rate": 4.570158102766799e-05, "loss": 0.4702, "step": 200 }, { "epoch": 0.5352862849533955, "grad_norm": 0.390485621135718, "learning_rate": 4.565217391304348e-05, "loss": 0.467, "step": 201 }, { "epoch": 0.5379494007989347, "grad_norm": 0.36389394329456204, "learning_rate": 4.560276679841897e-05, "loss": 0.4676, "step": 202 }, { "epoch": 0.5406125166444741, "grad_norm": 0.36415110756708385, "learning_rate": 4.555335968379447e-05, "loss": 0.4508, "step": 203 }, { "epoch": 0.5432756324900133, "grad_norm": 0.5185630368770853, "learning_rate": 4.550395256916996e-05, "loss": 0.4835, "step": 204 }, { "epoch": 0.5459387483355526, "grad_norm": 0.3004205195451817, "learning_rate": 4.545454545454546e-05, "loss": 0.4655, "step": 205 }, { "epoch": 0.5486018641810919, "grad_norm": 0.40992528241944887, "learning_rate": 4.540513833992095e-05, "loss": 0.4516, "step": 206 }, { "epoch": 0.5512649800266312, "grad_norm": 0.3462175317121373, "learning_rate": 4.535573122529644e-05, "loss": 0.4471, "step": 207 }, { "epoch": 0.5539280958721704, "grad_norm": 0.4220985656684442, "learning_rate": 4.530632411067194e-05, "loss": 0.4483, "step": 208 }, { "epoch": 0.5565912117177098, "grad_norm": 0.2992081906139443, "learning_rate": 4.525691699604743e-05, "loss": 0.4659, "step": 209 }, { "epoch": 0.559254327563249, "grad_norm": 0.34958390386904065, "learning_rate": 4.520750988142293e-05, "loss": 0.4594, "step": 210 }, { "epoch": 0.5619174434087882, "grad_norm": 0.36711080919022626, "learning_rate": 4.515810276679842e-05, "loss": 0.4329, "step": 211 }, { "epoch": 0.5645805592543276, "grad_norm": 0.32211416124144243, "learning_rate": 4.5108695652173916e-05, "loss": 0.4487, "step": 212 }, { "epoch": 0.5672436750998668, "grad_norm": 0.38626649006957514, "learning_rate": 4.505928853754941e-05, "loss": 0.4544, "step": 213 }, { "epoch": 0.5699067909454061, "grad_norm": 0.4022394284778984, "learning_rate": 4.5009881422924905e-05, "loss": 0.4505, "step": 214 }, { "epoch": 0.5725699067909454, "grad_norm": 0.3174185878452103, "learning_rate": 4.4960474308300396e-05, "loss": 0.4652, "step": 215 }, { "epoch": 0.5752330226364847, "grad_norm": 0.3872997977647099, "learning_rate": 4.4911067193675893e-05, "loss": 0.4771, "step": 216 }, { "epoch": 0.5778961384820239, "grad_norm": 0.2832157450180407, "learning_rate": 4.4861660079051384e-05, "loss": 0.4535, "step": 217 }, { "epoch": 0.5805592543275633, "grad_norm": 0.3394496956003534, "learning_rate": 4.481225296442688e-05, "loss": 0.4401, "step": 218 }, { "epoch": 0.5832223701731025, "grad_norm": 0.29084562762850125, "learning_rate": 4.476284584980237e-05, "loss": 0.445, "step": 219 }, { "epoch": 0.5858854860186418, "grad_norm": 0.30783953367051076, "learning_rate": 4.471343873517787e-05, "loss": 0.437, "step": 220 }, { "epoch": 0.5885486018641811, "grad_norm": 0.3183591003829617, "learning_rate": 4.466403162055336e-05, "loss": 0.4549, "step": 221 }, { "epoch": 0.5912117177097204, "grad_norm": 0.30102542208170724, "learning_rate": 4.461462450592885e-05, "loss": 0.4455, "step": 222 }, { "epoch": 0.5938748335552596, "grad_norm": 0.36209246659651434, "learning_rate": 4.456521739130435e-05, "loss": 0.4401, "step": 223 }, { "epoch": 0.596537949400799, "grad_norm": 0.3264752372953629, "learning_rate": 4.451581027667984e-05, "loss": 0.4379, "step": 224 }, { "epoch": 0.5992010652463382, "grad_norm": 0.38508783562543825, "learning_rate": 4.446640316205534e-05, "loss": 0.4617, "step": 225 }, { "epoch": 0.6018641810918774, "grad_norm": 0.3397449828204806, "learning_rate": 4.441699604743083e-05, "loss": 0.4516, "step": 226 }, { "epoch": 0.6045272969374168, "grad_norm": 0.3587152523608094, "learning_rate": 4.436758893280633e-05, "loss": 0.4627, "step": 227 }, { "epoch": 0.607190412782956, "grad_norm": 0.3533298903513862, "learning_rate": 4.431818181818182e-05, "loss": 0.4539, "step": 228 }, { "epoch": 0.6098535286284953, "grad_norm": 0.4031621223527615, "learning_rate": 4.426877470355732e-05, "loss": 0.4475, "step": 229 }, { "epoch": 0.6125166444740346, "grad_norm": 0.31598897434214096, "learning_rate": 4.421936758893281e-05, "loss": 0.4594, "step": 230 }, { "epoch": 0.6151797603195739, "grad_norm": 0.39490506767356415, "learning_rate": 4.4169960474308306e-05, "loss": 0.4481, "step": 231 }, { "epoch": 0.6178428761651131, "grad_norm": 0.34551286464789904, "learning_rate": 4.41205533596838e-05, "loss": 0.4417, "step": 232 }, { "epoch": 0.6205059920106525, "grad_norm": 0.3471665108105545, "learning_rate": 4.4071146245059295e-05, "loss": 0.444, "step": 233 }, { "epoch": 0.6231691078561917, "grad_norm": 0.3236727871934815, "learning_rate": 4.4021739130434786e-05, "loss": 0.4465, "step": 234 }, { "epoch": 0.625832223701731, "grad_norm": 0.3951638876292987, "learning_rate": 4.397233201581028e-05, "loss": 0.4476, "step": 235 }, { "epoch": 0.6284953395472703, "grad_norm": 0.3186324774552031, "learning_rate": 4.3922924901185774e-05, "loss": 0.4359, "step": 236 }, { "epoch": 0.6311584553928096, "grad_norm": 0.3446758582788272, "learning_rate": 4.387351778656127e-05, "loss": 0.4425, "step": 237 }, { "epoch": 0.6338215712383488, "grad_norm": 0.3712178318421026, "learning_rate": 4.382411067193676e-05, "loss": 0.4479, "step": 238 }, { "epoch": 0.6364846870838882, "grad_norm": 0.2869593917948936, "learning_rate": 4.377470355731226e-05, "loss": 0.4487, "step": 239 }, { "epoch": 0.6391478029294274, "grad_norm": 0.35621809137402505, "learning_rate": 4.3725296442687745e-05, "loss": 0.459, "step": 240 }, { "epoch": 0.6418109187749668, "grad_norm": 0.3219598029099912, "learning_rate": 4.367588932806324e-05, "loss": 0.4486, "step": 241 }, { "epoch": 0.644474034620506, "grad_norm": 0.345671883817814, "learning_rate": 4.3626482213438734e-05, "loss": 0.4494, "step": 242 }, { "epoch": 0.6471371504660453, "grad_norm": 0.3326228424406132, "learning_rate": 4.357707509881423e-05, "loss": 0.467, "step": 243 }, { "epoch": 0.6498002663115846, "grad_norm": 0.42093399894851624, "learning_rate": 4.352766798418972e-05, "loss": 0.4361, "step": 244 }, { "epoch": 0.6524633821571239, "grad_norm": 0.4162222276319394, "learning_rate": 4.347826086956522e-05, "loss": 0.4606, "step": 245 }, { "epoch": 0.6551264980026631, "grad_norm": 0.36750359997980137, "learning_rate": 4.342885375494071e-05, "loss": 0.4429, "step": 246 }, { "epoch": 0.6577896138482024, "grad_norm": 0.5483612794064252, "learning_rate": 4.337944664031621e-05, "loss": 0.4533, "step": 247 }, { "epoch": 0.6604527296937417, "grad_norm": 0.3506444877775761, "learning_rate": 4.33300395256917e-05, "loss": 0.4469, "step": 248 }, { "epoch": 0.6631158455392809, "grad_norm": 0.49614493451666597, "learning_rate": 4.32806324110672e-05, "loss": 0.4511, "step": 249 }, { "epoch": 0.6657789613848203, "grad_norm": 0.38209500350480796, "learning_rate": 4.323122529644269e-05, "loss": 0.4556, "step": 250 }, { "epoch": 0.6684420772303595, "grad_norm": 0.3909575859613948, "learning_rate": 4.318181818181819e-05, "loss": 0.4573, "step": 251 }, { "epoch": 0.6711051930758988, "grad_norm": 0.41081105341671875, "learning_rate": 4.313241106719368e-05, "loss": 0.4319, "step": 252 }, { "epoch": 0.6737683089214381, "grad_norm": 0.3263282193938601, "learning_rate": 4.3083003952569175e-05, "loss": 0.4477, "step": 253 }, { "epoch": 0.6764314247669774, "grad_norm": 0.30906206450856727, "learning_rate": 4.3033596837944666e-05, "loss": 0.449, "step": 254 }, { "epoch": 0.6790945406125166, "grad_norm": 0.4519613203178409, "learning_rate": 4.2984189723320164e-05, "loss": 0.4411, "step": 255 }, { "epoch": 0.681757656458056, "grad_norm": 0.4018486844337667, "learning_rate": 4.2934782608695655e-05, "loss": 0.4402, "step": 256 }, { "epoch": 0.6844207723035952, "grad_norm": 0.41908409625079107, "learning_rate": 4.288537549407115e-05, "loss": 0.4531, "step": 257 }, { "epoch": 0.6870838881491345, "grad_norm": 0.34694110159483726, "learning_rate": 4.2835968379446644e-05, "loss": 0.4533, "step": 258 }, { "epoch": 0.6897470039946738, "grad_norm": 0.4051995527756752, "learning_rate": 4.2786561264822135e-05, "loss": 0.4533, "step": 259 }, { "epoch": 0.6924101198402131, "grad_norm": 0.3557731708549695, "learning_rate": 4.2737154150197626e-05, "loss": 0.4665, "step": 260 }, { "epoch": 0.6950732356857523, "grad_norm": 0.387832077012766, "learning_rate": 4.2687747035573124e-05, "loss": 0.4407, "step": 261 }, { "epoch": 0.6977363515312917, "grad_norm": 0.38082367574409703, "learning_rate": 4.2638339920948615e-05, "loss": 0.453, "step": 262 }, { "epoch": 0.7003994673768309, "grad_norm": 0.33683683724829466, "learning_rate": 4.258893280632411e-05, "loss": 0.4635, "step": 263 }, { "epoch": 0.7030625832223701, "grad_norm": 0.4169335496839881, "learning_rate": 4.2539525691699603e-05, "loss": 0.4563, "step": 264 }, { "epoch": 0.7057256990679095, "grad_norm": 0.3214835965167982, "learning_rate": 4.24901185770751e-05, "loss": 0.4542, "step": 265 }, { "epoch": 0.7083888149134487, "grad_norm": 0.3530582715253166, "learning_rate": 4.244071146245059e-05, "loss": 0.4331, "step": 266 }, { "epoch": 0.711051930758988, "grad_norm": 0.36340494740289614, "learning_rate": 4.239130434782609e-05, "loss": 0.4394, "step": 267 }, { "epoch": 0.7137150466045273, "grad_norm": 0.3874861034018051, "learning_rate": 4.234189723320158e-05, "loss": 0.4297, "step": 268 }, { "epoch": 0.7163781624500666, "grad_norm": 0.387734289004501, "learning_rate": 4.229249011857708e-05, "loss": 0.4518, "step": 269 }, { "epoch": 0.7190412782956058, "grad_norm": 0.3011771126496286, "learning_rate": 4.224308300395257e-05, "loss": 0.4369, "step": 270 }, { "epoch": 0.7217043941411452, "grad_norm": 0.41746724783245387, "learning_rate": 4.219367588932807e-05, "loss": 0.4509, "step": 271 }, { "epoch": 0.7243675099866844, "grad_norm": 0.3395798145391856, "learning_rate": 4.214426877470356e-05, "loss": 0.4643, "step": 272 }, { "epoch": 0.7270306258322237, "grad_norm": 0.4118033460496559, "learning_rate": 4.2094861660079056e-05, "loss": 0.4238, "step": 273 }, { "epoch": 0.729693741677763, "grad_norm": 0.2988995865914867, "learning_rate": 4.204545454545455e-05, "loss": 0.4414, "step": 274 }, { "epoch": 0.7323568575233023, "grad_norm": 0.4755302873686915, "learning_rate": 4.1996047430830045e-05, "loss": 0.4408, "step": 275 }, { "epoch": 0.7350199733688415, "grad_norm": 0.3321861192448237, "learning_rate": 4.1946640316205536e-05, "loss": 0.4471, "step": 276 }, { "epoch": 0.7376830892143809, "grad_norm": 0.45541818319145366, "learning_rate": 4.1897233201581034e-05, "loss": 0.4473, "step": 277 }, { "epoch": 0.7403462050599201, "grad_norm": 0.37099566890533026, "learning_rate": 4.1847826086956525e-05, "loss": 0.4495, "step": 278 }, { "epoch": 0.7430093209054593, "grad_norm": 0.4035270770785246, "learning_rate": 4.1798418972332016e-05, "loss": 0.4513, "step": 279 }, { "epoch": 0.7456724367509987, "grad_norm": 0.3441312582159767, "learning_rate": 4.174901185770751e-05, "loss": 0.4358, "step": 280 }, { "epoch": 0.748335552596538, "grad_norm": 0.44606462407083225, "learning_rate": 4.1699604743083005e-05, "loss": 0.4441, "step": 281 }, { "epoch": 0.7509986684420772, "grad_norm": 0.41551217890891706, "learning_rate": 4.1650197628458496e-05, "loss": 0.4389, "step": 282 }, { "epoch": 0.7536617842876165, "grad_norm": 0.3972988958201408, "learning_rate": 4.160079051383399e-05, "loss": 0.4375, "step": 283 }, { "epoch": 0.7563249001331558, "grad_norm": 0.47085225893645843, "learning_rate": 4.1551383399209484e-05, "loss": 0.4567, "step": 284 }, { "epoch": 0.758988015978695, "grad_norm": 0.34543261673414827, "learning_rate": 4.150197628458498e-05, "loss": 0.4459, "step": 285 }, { "epoch": 0.7616511318242344, "grad_norm": 0.43195994812681116, "learning_rate": 4.145256916996047e-05, "loss": 0.4589, "step": 286 }, { "epoch": 0.7643142476697736, "grad_norm": 0.3459436864735825, "learning_rate": 4.140316205533597e-05, "loss": 0.4599, "step": 287 }, { "epoch": 0.7669773635153129, "grad_norm": 0.36207300529867464, "learning_rate": 4.135375494071146e-05, "loss": 0.4303, "step": 288 }, { "epoch": 0.7696404793608522, "grad_norm": 0.41345784501066335, "learning_rate": 4.130434782608696e-05, "loss": 0.4271, "step": 289 }, { "epoch": 0.7723035952063915, "grad_norm": 0.3159838632384483, "learning_rate": 4.125494071146245e-05, "loss": 0.4559, "step": 290 }, { "epoch": 0.7749667110519307, "grad_norm": 0.3812699162571922, "learning_rate": 4.120553359683795e-05, "loss": 0.4342, "step": 291 }, { "epoch": 0.7776298268974701, "grad_norm": 0.37911131885498967, "learning_rate": 4.115612648221344e-05, "loss": 0.4362, "step": 292 }, { "epoch": 0.7802929427430093, "grad_norm": 0.29763254355588903, "learning_rate": 4.110671936758894e-05, "loss": 0.438, "step": 293 }, { "epoch": 0.7829560585885486, "grad_norm": 0.42619217859831243, "learning_rate": 4.105731225296443e-05, "loss": 0.4359, "step": 294 }, { "epoch": 0.7856191744340879, "grad_norm": 0.3300550679665931, "learning_rate": 4.1007905138339926e-05, "loss": 0.43, "step": 295 }, { "epoch": 0.7882822902796272, "grad_norm": 0.36668560763021596, "learning_rate": 4.095849802371542e-05, "loss": 0.4307, "step": 296 }, { "epoch": 0.7909454061251664, "grad_norm": 0.4285864023060217, "learning_rate": 4.0909090909090915e-05, "loss": 0.44, "step": 297 }, { "epoch": 0.7936085219707057, "grad_norm": 0.40308733058892654, "learning_rate": 4.0859683794466406e-05, "loss": 0.4438, "step": 298 }, { "epoch": 0.796271637816245, "grad_norm": 0.48251508562888784, "learning_rate": 4.08102766798419e-05, "loss": 0.465, "step": 299 }, { "epoch": 0.7989347536617842, "grad_norm": 0.3630289677972406, "learning_rate": 4.076086956521739e-05, "loss": 0.4472, "step": 300 }, { "epoch": 0.8015978695073236, "grad_norm": 0.39496674097555107, "learning_rate": 4.0711462450592886e-05, "loss": 0.4391, "step": 301 }, { "epoch": 0.8042609853528628, "grad_norm": 0.3844393845604204, "learning_rate": 4.0662055335968377e-05, "loss": 0.4594, "step": 302 }, { "epoch": 0.8069241011984021, "grad_norm": 0.41185922961873794, "learning_rate": 4.0612648221343874e-05, "loss": 0.4302, "step": 303 }, { "epoch": 0.8095872170439414, "grad_norm": 0.3856385433600225, "learning_rate": 4.0563241106719365e-05, "loss": 0.4436, "step": 304 }, { "epoch": 0.8122503328894807, "grad_norm": 0.38840299488987834, "learning_rate": 4.051383399209486e-05, "loss": 0.4536, "step": 305 }, { "epoch": 0.8149134487350199, "grad_norm": 0.3814150713404761, "learning_rate": 4.0464426877470354e-05, "loss": 0.4478, "step": 306 }, { "epoch": 0.8175765645805593, "grad_norm": 0.3688695146114231, "learning_rate": 4.041501976284585e-05, "loss": 0.4371, "step": 307 }, { "epoch": 0.8202396804260985, "grad_norm": 0.4525942844580142, "learning_rate": 4.036561264822134e-05, "loss": 0.4291, "step": 308 }, { "epoch": 0.8229027962716379, "grad_norm": 0.4052871924274271, "learning_rate": 4.031620553359684e-05, "loss": 0.4441, "step": 309 }, { "epoch": 0.8255659121171771, "grad_norm": 0.39806513754399514, "learning_rate": 4.026679841897233e-05, "loss": 0.4411, "step": 310 }, { "epoch": 0.8282290279627164, "grad_norm": 0.3805049053303521, "learning_rate": 4.021739130434783e-05, "loss": 0.4366, "step": 311 }, { "epoch": 0.8308921438082557, "grad_norm": 0.4001908389883243, "learning_rate": 4.016798418972332e-05, "loss": 0.4481, "step": 312 }, { "epoch": 0.833555259653795, "grad_norm": 0.3685478975261263, "learning_rate": 4.011857707509882e-05, "loss": 0.4444, "step": 313 }, { "epoch": 0.8362183754993342, "grad_norm": 0.3338436350006864, "learning_rate": 4.006916996047431e-05, "loss": 0.4479, "step": 314 }, { "epoch": 0.8388814913448736, "grad_norm": 0.41429245260714803, "learning_rate": 4.001976284584981e-05, "loss": 0.449, "step": 315 }, { "epoch": 0.8415446071904128, "grad_norm": 0.4423411865525233, "learning_rate": 3.99703557312253e-05, "loss": 0.4659, "step": 316 }, { "epoch": 0.844207723035952, "grad_norm": 0.2957853011048819, "learning_rate": 3.9920948616600796e-05, "loss": 0.4251, "step": 317 }, { "epoch": 0.8468708388814914, "grad_norm": 0.4030160825498704, "learning_rate": 3.987154150197629e-05, "loss": 0.4371, "step": 318 }, { "epoch": 0.8495339547270306, "grad_norm": 0.3580572215645172, "learning_rate": 3.982213438735178e-05, "loss": 0.4227, "step": 319 }, { "epoch": 0.8521970705725699, "grad_norm": 0.39710125591854223, "learning_rate": 3.9772727272727275e-05, "loss": 0.4293, "step": 320 }, { "epoch": 0.8548601864181092, "grad_norm": 0.4051765562646604, "learning_rate": 3.9723320158102766e-05, "loss": 0.4334, "step": 321 }, { "epoch": 0.8575233022636485, "grad_norm": 0.41675278060825943, "learning_rate": 3.9673913043478264e-05, "loss": 0.4386, "step": 322 }, { "epoch": 0.8601864181091877, "grad_norm": 0.4375405045592726, "learning_rate": 3.9624505928853755e-05, "loss": 0.4533, "step": 323 }, { "epoch": 0.8628495339547271, "grad_norm": 0.4043621563504148, "learning_rate": 3.957509881422925e-05, "loss": 0.4497, "step": 324 }, { "epoch": 0.8655126498002663, "grad_norm": 0.37983530045601516, "learning_rate": 3.9525691699604744e-05, "loss": 0.4392, "step": 325 }, { "epoch": 0.8681757656458056, "grad_norm": 0.4289732652538706, "learning_rate": 3.947628458498024e-05, "loss": 0.4401, "step": 326 }, { "epoch": 0.8708388814913449, "grad_norm": 0.34033600614743714, "learning_rate": 3.942687747035573e-05, "loss": 0.453, "step": 327 }, { "epoch": 0.8735019973368842, "grad_norm": 0.399300367168935, "learning_rate": 3.937747035573123e-05, "loss": 0.433, "step": 328 }, { "epoch": 0.8761651131824234, "grad_norm": 0.36717092389818584, "learning_rate": 3.932806324110672e-05, "loss": 0.4523, "step": 329 }, { "epoch": 0.8788282290279628, "grad_norm": 0.43669770511305556, "learning_rate": 3.927865612648222e-05, "loss": 0.437, "step": 330 }, { "epoch": 0.881491344873502, "grad_norm": 0.3631294987791108, "learning_rate": 3.922924901185771e-05, "loss": 0.4335, "step": 331 }, { "epoch": 0.8841544607190412, "grad_norm": 0.45116504976872973, "learning_rate": 3.917984189723321e-05, "loss": 0.4562, "step": 332 }, { "epoch": 0.8868175765645806, "grad_norm": 0.3163566159546663, "learning_rate": 3.91304347826087e-05, "loss": 0.4286, "step": 333 }, { "epoch": 0.8894806924101198, "grad_norm": 0.49699702016497876, "learning_rate": 3.90810276679842e-05, "loss": 0.4214, "step": 334 }, { "epoch": 0.8921438082556591, "grad_norm": 0.4164898463983148, "learning_rate": 3.903162055335969e-05, "loss": 0.4354, "step": 335 }, { "epoch": 0.8948069241011984, "grad_norm": 0.39631778611383006, "learning_rate": 3.8982213438735186e-05, "loss": 0.4389, "step": 336 }, { "epoch": 0.8974700399467377, "grad_norm": 0.4545892509897146, "learning_rate": 3.893280632411067e-05, "loss": 0.4312, "step": 337 }, { "epoch": 0.9001331557922769, "grad_norm": 0.41988367228289636, "learning_rate": 3.888339920948617e-05, "loss": 0.4433, "step": 338 }, { "epoch": 0.9027962716378163, "grad_norm": 0.3123307577517813, "learning_rate": 3.883399209486166e-05, "loss": 0.4272, "step": 339 }, { "epoch": 0.9054593874833555, "grad_norm": 0.31692127951353677, "learning_rate": 3.8784584980237156e-05, "loss": 0.4292, "step": 340 }, { "epoch": 0.9081225033288948, "grad_norm": 0.33613245505768613, "learning_rate": 3.873517786561265e-05, "loss": 0.4249, "step": 341 }, { "epoch": 0.9107856191744341, "grad_norm": 0.30559768683570065, "learning_rate": 3.8685770750988145e-05, "loss": 0.4398, "step": 342 }, { "epoch": 0.9134487350199734, "grad_norm": 0.3939981911193064, "learning_rate": 3.8636363636363636e-05, "loss": 0.4335, "step": 343 }, { "epoch": 0.9161118508655126, "grad_norm": 0.33858345690029085, "learning_rate": 3.8586956521739134e-05, "loss": 0.4451, "step": 344 }, { "epoch": 0.918774966711052, "grad_norm": 0.3422872934004404, "learning_rate": 3.8537549407114625e-05, "loss": 0.4353, "step": 345 }, { "epoch": 0.9214380825565912, "grad_norm": 0.3280283881293896, "learning_rate": 3.848814229249012e-05, "loss": 0.4336, "step": 346 }, { "epoch": 0.9241011984021305, "grad_norm": 0.3212166344001671, "learning_rate": 3.8438735177865614e-05, "loss": 0.4436, "step": 347 }, { "epoch": 0.9267643142476698, "grad_norm": 0.29779879718680563, "learning_rate": 3.838932806324111e-05, "loss": 0.4224, "step": 348 }, { "epoch": 0.929427430093209, "grad_norm": 0.32257209602500175, "learning_rate": 3.83399209486166e-05, "loss": 0.4324, "step": 349 }, { "epoch": 0.9320905459387483, "grad_norm": 0.3283760169277036, "learning_rate": 3.82905138339921e-05, "loss": 0.4312, "step": 350 }, { "epoch": 0.9347536617842876, "grad_norm": 0.29560048048387905, "learning_rate": 3.824110671936759e-05, "loss": 0.438, "step": 351 }, { "epoch": 0.9374167776298269, "grad_norm": 0.31047996971013586, "learning_rate": 3.819169960474309e-05, "loss": 0.436, "step": 352 }, { "epoch": 0.9400798934753661, "grad_norm": 0.3203340478559344, "learning_rate": 3.814229249011858e-05, "loss": 0.4178, "step": 353 }, { "epoch": 0.9427430093209055, "grad_norm": 0.3000799797652741, "learning_rate": 3.809288537549408e-05, "loss": 0.4283, "step": 354 }, { "epoch": 0.9454061251664447, "grad_norm": 0.31625082964426837, "learning_rate": 3.804347826086957e-05, "loss": 0.4355, "step": 355 }, { "epoch": 0.948069241011984, "grad_norm": 0.38688019968777704, "learning_rate": 3.7994071146245066e-05, "loss": 0.4561, "step": 356 }, { "epoch": 0.9507323568575233, "grad_norm": 0.309916135809927, "learning_rate": 3.794466403162055e-05, "loss": 0.4323, "step": 357 }, { "epoch": 0.9533954727030626, "grad_norm": 0.4119303884073823, "learning_rate": 3.789525691699605e-05, "loss": 0.4346, "step": 358 }, { "epoch": 0.9560585885486018, "grad_norm": 0.36057463061333933, "learning_rate": 3.784584980237154e-05, "loss": 0.4521, "step": 359 }, { "epoch": 0.9587217043941412, "grad_norm": 0.3385683676369823, "learning_rate": 3.779644268774704e-05, "loss": 0.4186, "step": 360 }, { "epoch": 0.9613848202396804, "grad_norm": 0.40056553056875543, "learning_rate": 3.774703557312253e-05, "loss": 0.4577, "step": 361 }, { "epoch": 0.9640479360852197, "grad_norm": 0.3362167210172609, "learning_rate": 3.7697628458498026e-05, "loss": 0.4232, "step": 362 }, { "epoch": 0.966711051930759, "grad_norm": 0.39765353196088127, "learning_rate": 3.764822134387352e-05, "loss": 0.4441, "step": 363 }, { "epoch": 0.9693741677762983, "grad_norm": 0.34508268417865146, "learning_rate": 3.7598814229249015e-05, "loss": 0.4339, "step": 364 }, { "epoch": 0.9720372836218375, "grad_norm": 0.346158165413465, "learning_rate": 3.7549407114624506e-05, "loss": 0.4314, "step": 365 }, { "epoch": 0.9747003994673769, "grad_norm": 0.38758138562436, "learning_rate": 3.7500000000000003e-05, "loss": 0.4479, "step": 366 }, { "epoch": 0.9773635153129161, "grad_norm": 0.3616955496837348, "learning_rate": 3.7450592885375494e-05, "loss": 0.4295, "step": 367 }, { "epoch": 0.9800266311584553, "grad_norm": 0.36330419598482033, "learning_rate": 3.740118577075099e-05, "loss": 0.431, "step": 368 }, { "epoch": 0.9826897470039947, "grad_norm": 0.38220931731215757, "learning_rate": 3.735177865612648e-05, "loss": 0.4411, "step": 369 }, { "epoch": 0.9853528628495339, "grad_norm": 0.32482883893874537, "learning_rate": 3.730237154150198e-05, "loss": 0.4352, "step": 370 }, { "epoch": 0.9880159786950732, "grad_norm": 0.3797976983855516, "learning_rate": 3.725296442687747e-05, "loss": 0.4273, "step": 371 }, { "epoch": 0.9906790945406125, "grad_norm": 0.3333203576267911, "learning_rate": 3.720355731225297e-05, "loss": 0.4353, "step": 372 }, { "epoch": 0.9933422103861518, "grad_norm": 0.3565932063789887, "learning_rate": 3.715415019762846e-05, "loss": 0.4312, "step": 373 }, { "epoch": 0.996005326231691, "grad_norm": 0.35499721260713074, "learning_rate": 3.710474308300396e-05, "loss": 0.4328, "step": 374 }, { "epoch": 0.9986684420772304, "grad_norm": 0.34312841144350587, "learning_rate": 3.705533596837945e-05, "loss": 0.4238, "step": 375 }, { "epoch": 1.0, "grad_norm": 0.34312841144350587, "learning_rate": 3.700592885375494e-05, "loss": 0.4292, "step": 376 }, { "epoch": 1.0026631158455392, "grad_norm": 0.523484923884555, "learning_rate": 3.695652173913043e-05, "loss": 0.3827, "step": 377 }, { "epoch": 1.0053262316910785, "grad_norm": 0.44981178204276556, "learning_rate": 3.690711462450593e-05, "loss": 0.3497, "step": 378 }, { "epoch": 1.007989347536618, "grad_norm": 0.30585009680415987, "learning_rate": 3.685770750988142e-05, "loss": 0.3667, "step": 379 }, { "epoch": 1.0106524633821572, "grad_norm": 0.3734972975740805, "learning_rate": 3.680830039525692e-05, "loss": 0.365, "step": 380 }, { "epoch": 1.0133155792276964, "grad_norm": 0.32549667969227175, "learning_rate": 3.675889328063241e-05, "loss": 0.3756, "step": 381 }, { "epoch": 1.0159786950732357, "grad_norm": 0.4493130971817616, "learning_rate": 3.670948616600791e-05, "loss": 0.358, "step": 382 }, { "epoch": 1.018641810918775, "grad_norm": 0.40705895511048784, "learning_rate": 3.66600790513834e-05, "loss": 0.3711, "step": 383 }, { "epoch": 1.0213049267643142, "grad_norm": 0.3979472669944709, "learning_rate": 3.6610671936758896e-05, "loss": 0.3613, "step": 384 }, { "epoch": 1.0239680426098536, "grad_norm": 0.44247177084982264, "learning_rate": 3.656126482213439e-05, "loss": 0.3461, "step": 385 }, { "epoch": 1.0266311584553929, "grad_norm": 0.3643767210189153, "learning_rate": 3.6511857707509884e-05, "loss": 0.3682, "step": 386 }, { "epoch": 1.0292942743009321, "grad_norm": 0.3710522218627508, "learning_rate": 3.6462450592885375e-05, "loss": 0.3616, "step": 387 }, { "epoch": 1.0319573901464714, "grad_norm": 0.39199235847196745, "learning_rate": 3.641304347826087e-05, "loss": 0.3373, "step": 388 }, { "epoch": 1.0346205059920106, "grad_norm": 0.3716307271666748, "learning_rate": 3.6363636363636364e-05, "loss": 0.3783, "step": 389 }, { "epoch": 1.0372836218375499, "grad_norm": 0.39593613574016095, "learning_rate": 3.631422924901186e-05, "loss": 0.3605, "step": 390 }, { "epoch": 1.0399467376830893, "grad_norm": 0.3741049180680241, "learning_rate": 3.626482213438735e-05, "loss": 0.3643, "step": 391 }, { "epoch": 1.0426098535286286, "grad_norm": 0.39560887666458844, "learning_rate": 3.621541501976285e-05, "loss": 0.3873, "step": 392 }, { "epoch": 1.0452729693741678, "grad_norm": 0.4542194912059658, "learning_rate": 3.616600790513834e-05, "loss": 0.3517, "step": 393 }, { "epoch": 1.047936085219707, "grad_norm": 0.3376853296582342, "learning_rate": 3.611660079051384e-05, "loss": 0.3746, "step": 394 }, { "epoch": 1.0505992010652463, "grad_norm": 0.38846148578122447, "learning_rate": 3.606719367588933e-05, "loss": 0.3389, "step": 395 }, { "epoch": 1.0532623169107855, "grad_norm": 0.32360005393691865, "learning_rate": 3.601778656126482e-05, "loss": 0.3663, "step": 396 }, { "epoch": 1.055925432756325, "grad_norm": 0.326112805381814, "learning_rate": 3.596837944664031e-05, "loss": 0.3581, "step": 397 }, { "epoch": 1.0585885486018642, "grad_norm": 0.28926622056464246, "learning_rate": 3.591897233201581e-05, "loss": 0.358, "step": 398 }, { "epoch": 1.0612516644474035, "grad_norm": 0.3055465293423247, "learning_rate": 3.58695652173913e-05, "loss": 0.3617, "step": 399 }, { "epoch": 1.0639147802929427, "grad_norm": 0.33022021713183336, "learning_rate": 3.58201581027668e-05, "loss": 0.353, "step": 400 }, { "epoch": 1.066577896138482, "grad_norm": 0.29024468585164404, "learning_rate": 3.577075098814229e-05, "loss": 0.355, "step": 401 }, { "epoch": 1.0692410119840212, "grad_norm": 0.2733040275941461, "learning_rate": 3.572134387351779e-05, "loss": 0.3574, "step": 402 }, { "epoch": 1.0719041278295607, "grad_norm": 0.3226214256196561, "learning_rate": 3.567193675889328e-05, "loss": 0.3528, "step": 403 }, { "epoch": 1.0745672436751, "grad_norm": 0.31534151465175414, "learning_rate": 3.5622529644268777e-05, "loss": 0.3539, "step": 404 }, { "epoch": 1.0772303595206392, "grad_norm": 0.2751061424659443, "learning_rate": 3.557312252964427e-05, "loss": 0.3667, "step": 405 }, { "epoch": 1.0798934753661784, "grad_norm": 0.3612676719250419, "learning_rate": 3.5523715415019765e-05, "loss": 0.3541, "step": 406 }, { "epoch": 1.0825565912117177, "grad_norm": 0.3011759295136269, "learning_rate": 3.5474308300395256e-05, "loss": 0.3606, "step": 407 }, { "epoch": 1.085219707057257, "grad_norm": 0.3978993850172965, "learning_rate": 3.5424901185770754e-05, "loss": 0.3626, "step": 408 }, { "epoch": 1.0878828229027964, "grad_norm": 0.2872210237523896, "learning_rate": 3.5375494071146245e-05, "loss": 0.3889, "step": 409 }, { "epoch": 1.0905459387483356, "grad_norm": 0.443073058318771, "learning_rate": 3.532608695652174e-05, "loss": 0.3535, "step": 410 }, { "epoch": 1.0932090545938749, "grad_norm": 0.33127012106810017, "learning_rate": 3.5276679841897234e-05, "loss": 0.3459, "step": 411 }, { "epoch": 1.095872170439414, "grad_norm": 0.2919448905657829, "learning_rate": 3.522727272727273e-05, "loss": 0.365, "step": 412 }, { "epoch": 1.0985352862849533, "grad_norm": 0.33466018716475304, "learning_rate": 3.517786561264822e-05, "loss": 0.3625, "step": 413 }, { "epoch": 1.1011984021304926, "grad_norm": 0.3413607594653121, "learning_rate": 3.512845849802372e-05, "loss": 0.3724, "step": 414 }, { "epoch": 1.103861517976032, "grad_norm": 0.35737975021729407, "learning_rate": 3.507905138339921e-05, "loss": 0.3774, "step": 415 }, { "epoch": 1.1065246338215713, "grad_norm": 0.34162270993471044, "learning_rate": 3.50296442687747e-05, "loss": 0.3686, "step": 416 }, { "epoch": 1.1091877496671105, "grad_norm": 0.35133143811370443, "learning_rate": 3.49802371541502e-05, "loss": 0.3699, "step": 417 }, { "epoch": 1.1118508655126498, "grad_norm": 0.3579722853716089, "learning_rate": 3.493083003952569e-05, "loss": 0.3505, "step": 418 }, { "epoch": 1.114513981358189, "grad_norm": 0.2618428057689255, "learning_rate": 3.488142292490119e-05, "loss": 0.3463, "step": 419 }, { "epoch": 1.1171770972037283, "grad_norm": 0.35732356240927676, "learning_rate": 3.483201581027668e-05, "loss": 0.3473, "step": 420 }, { "epoch": 1.1198402130492677, "grad_norm": 0.34101793627943705, "learning_rate": 3.478260869565218e-05, "loss": 0.3738, "step": 421 }, { "epoch": 1.122503328894807, "grad_norm": 0.3005835100136546, "learning_rate": 3.473320158102767e-05, "loss": 0.3748, "step": 422 }, { "epoch": 1.1251664447403462, "grad_norm": 0.3512554307406862, "learning_rate": 3.4683794466403166e-05, "loss": 0.3578, "step": 423 }, { "epoch": 1.1278295605858855, "grad_norm": 0.3037958675770476, "learning_rate": 3.463438735177866e-05, "loss": 0.3812, "step": 424 }, { "epoch": 1.1304926764314247, "grad_norm": 0.33131881019625853, "learning_rate": 3.4584980237154155e-05, "loss": 0.3475, "step": 425 }, { "epoch": 1.133155792276964, "grad_norm": 0.2887902456682679, "learning_rate": 3.4535573122529646e-05, "loss": 0.3658, "step": 426 }, { "epoch": 1.1358189081225034, "grad_norm": 0.3429001374635811, "learning_rate": 3.4486166007905144e-05, "loss": 0.37, "step": 427 }, { "epoch": 1.1384820239680427, "grad_norm": 0.32345869994940707, "learning_rate": 3.4436758893280635e-05, "loss": 0.3325, "step": 428 }, { "epoch": 1.141145139813582, "grad_norm": 0.3183193536956743, "learning_rate": 3.438735177865613e-05, "loss": 0.3597, "step": 429 }, { "epoch": 1.1438082556591211, "grad_norm": 0.3300209265208329, "learning_rate": 3.4337944664031624e-05, "loss": 0.3718, "step": 430 }, { "epoch": 1.1464713715046604, "grad_norm": 0.31339838507600637, "learning_rate": 3.428853754940712e-05, "loss": 0.3505, "step": 431 }, { "epoch": 1.1491344873501999, "grad_norm": 0.30103241701187505, "learning_rate": 3.423913043478261e-05, "loss": 0.3515, "step": 432 }, { "epoch": 1.151797603195739, "grad_norm": 0.33142077936580827, "learning_rate": 3.418972332015811e-05, "loss": 0.3454, "step": 433 }, { "epoch": 1.1544607190412783, "grad_norm": 0.26672583595142774, "learning_rate": 3.41403162055336e-05, "loss": 0.3557, "step": 434 }, { "epoch": 1.1571238348868176, "grad_norm": 0.29810972252935447, "learning_rate": 3.409090909090909e-05, "loss": 0.3627, "step": 435 }, { "epoch": 1.1597869507323568, "grad_norm": 0.4004613882147666, "learning_rate": 3.404150197628458e-05, "loss": 0.3596, "step": 436 }, { "epoch": 1.162450066577896, "grad_norm": 0.3230914038022782, "learning_rate": 3.399209486166008e-05, "loss": 0.3494, "step": 437 }, { "epoch": 1.1651131824234353, "grad_norm": 0.26213767359417905, "learning_rate": 3.394268774703557e-05, "loss": 0.3686, "step": 438 }, { "epoch": 1.1677762982689748, "grad_norm": 0.4095014774133373, "learning_rate": 3.389328063241107e-05, "loss": 0.3688, "step": 439 }, { "epoch": 1.170439414114514, "grad_norm": 0.266377270998587, "learning_rate": 3.384387351778656e-05, "loss": 0.3648, "step": 440 }, { "epoch": 1.1731025299600533, "grad_norm": 0.32985529288585497, "learning_rate": 3.379446640316206e-05, "loss": 0.3703, "step": 441 }, { "epoch": 1.1757656458055925, "grad_norm": 0.3629424885940422, "learning_rate": 3.374505928853755e-05, "loss": 0.3502, "step": 442 }, { "epoch": 1.1784287616511318, "grad_norm": 0.29079091604622403, "learning_rate": 3.369565217391305e-05, "loss": 0.3696, "step": 443 }, { "epoch": 1.1810918774966712, "grad_norm": 0.36019836895937174, "learning_rate": 3.364624505928854e-05, "loss": 0.3507, "step": 444 }, { "epoch": 1.1837549933422105, "grad_norm": 0.3710021105040673, "learning_rate": 3.3596837944664036e-05, "loss": 0.3458, "step": 445 }, { "epoch": 1.1864181091877497, "grad_norm": 0.2814671230360335, "learning_rate": 3.354743083003953e-05, "loss": 0.3625, "step": 446 }, { "epoch": 1.189081225033289, "grad_norm": 0.39752143956114194, "learning_rate": 3.3498023715415025e-05, "loss": 0.3372, "step": 447 }, { "epoch": 1.1917443408788282, "grad_norm": 0.3447518628047081, "learning_rate": 3.3448616600790516e-05, "loss": 0.352, "step": 448 }, { "epoch": 1.1944074567243674, "grad_norm": 0.23476338435026442, "learning_rate": 3.3399209486166014e-05, "loss": 0.3433, "step": 449 }, { "epoch": 1.1970705725699067, "grad_norm": 0.41285793244761565, "learning_rate": 3.3349802371541505e-05, "loss": 0.3507, "step": 450 }, { "epoch": 1.1997336884154461, "grad_norm": 0.2756526642604148, "learning_rate": 3.3300395256917e-05, "loss": 0.3679, "step": 451 }, { "epoch": 1.2023968042609854, "grad_norm": 0.35361646973541144, "learning_rate": 3.325098814229249e-05, "loss": 0.3771, "step": 452 }, { "epoch": 1.2050599201065246, "grad_norm": 0.3011012199917682, "learning_rate": 3.320158102766799e-05, "loss": 0.3501, "step": 453 }, { "epoch": 1.2077230359520639, "grad_norm": 0.2753809532139054, "learning_rate": 3.3152173913043475e-05, "loss": 0.3751, "step": 454 }, { "epoch": 1.2103861517976031, "grad_norm": 0.345446601586865, "learning_rate": 3.310276679841897e-05, "loss": 0.3675, "step": 455 }, { "epoch": 1.2130492676431426, "grad_norm": 0.3105483046559569, "learning_rate": 3.3053359683794464e-05, "loss": 0.3473, "step": 456 }, { "epoch": 1.2157123834886818, "grad_norm": 0.31097501000340777, "learning_rate": 3.300395256916996e-05, "loss": 0.3685, "step": 457 }, { "epoch": 1.218375499334221, "grad_norm": 0.35861972517870744, "learning_rate": 3.295454545454545e-05, "loss": 0.3493, "step": 458 }, { "epoch": 1.2210386151797603, "grad_norm": 0.2497414905559577, "learning_rate": 3.290513833992095e-05, "loss": 0.3596, "step": 459 }, { "epoch": 1.2237017310252996, "grad_norm": 0.3260671903675003, "learning_rate": 3.285573122529644e-05, "loss": 0.3584, "step": 460 }, { "epoch": 1.2263648468708388, "grad_norm": 0.303125715747872, "learning_rate": 3.280632411067194e-05, "loss": 0.3468, "step": 461 }, { "epoch": 1.229027962716378, "grad_norm": 0.2894307336548194, "learning_rate": 3.275691699604743e-05, "loss": 0.3589, "step": 462 }, { "epoch": 1.2316910785619175, "grad_norm": 0.3081296705994847, "learning_rate": 3.270750988142293e-05, "loss": 0.3586, "step": 463 }, { "epoch": 1.2343541944074568, "grad_norm": 0.2926327290593828, "learning_rate": 3.265810276679842e-05, "loss": 0.3594, "step": 464 }, { "epoch": 1.237017310252996, "grad_norm": 0.3050352656827861, "learning_rate": 3.260869565217392e-05, "loss": 0.3794, "step": 465 }, { "epoch": 1.2396804260985352, "grad_norm": 0.34421850278839233, "learning_rate": 3.255928853754941e-05, "loss": 0.3448, "step": 466 }, { "epoch": 1.2423435419440745, "grad_norm": 0.3178141996560178, "learning_rate": 3.2509881422924906e-05, "loss": 0.3596, "step": 467 }, { "epoch": 1.245006657789614, "grad_norm": 0.36055320312739547, "learning_rate": 3.24604743083004e-05, "loss": 0.3374, "step": 468 }, { "epoch": 1.2476697736351532, "grad_norm": 0.2584894490878346, "learning_rate": 3.2411067193675894e-05, "loss": 0.3381, "step": 469 }, { "epoch": 1.2503328894806924, "grad_norm": 0.3556442871963007, "learning_rate": 3.2361660079051385e-05, "loss": 0.3757, "step": 470 }, { "epoch": 1.2529960053262317, "grad_norm": 0.2936471278443274, "learning_rate": 3.231225296442688e-05, "loss": 0.3612, "step": 471 }, { "epoch": 1.255659121171771, "grad_norm": 0.34920820452723006, "learning_rate": 3.2262845849802374e-05, "loss": 0.3571, "step": 472 }, { "epoch": 1.2583222370173104, "grad_norm": 0.27353129045046504, "learning_rate": 3.221343873517787e-05, "loss": 0.366, "step": 473 }, { "epoch": 1.2609853528628494, "grad_norm": 0.3336825600119343, "learning_rate": 3.2164031620553356e-05, "loss": 0.3682, "step": 474 }, { "epoch": 1.2636484687083889, "grad_norm": 0.28422664920281926, "learning_rate": 3.2114624505928854e-05, "loss": 0.3574, "step": 475 }, { "epoch": 1.2663115845539281, "grad_norm": 0.27995772097533356, "learning_rate": 3.2065217391304345e-05, "loss": 0.3577, "step": 476 }, { "epoch": 1.2689747003994674, "grad_norm": 0.3073145651684054, "learning_rate": 3.201581027667984e-05, "loss": 0.356, "step": 477 }, { "epoch": 1.2716378162450066, "grad_norm": 0.2926799912079748, "learning_rate": 3.1966403162055334e-05, "loss": 0.3398, "step": 478 }, { "epoch": 1.2743009320905458, "grad_norm": 0.2638946062975387, "learning_rate": 3.191699604743083e-05, "loss": 0.3742, "step": 479 }, { "epoch": 1.2769640479360853, "grad_norm": 0.3188095670364053, "learning_rate": 3.186758893280632e-05, "loss": 0.3564, "step": 480 }, { "epoch": 1.2796271637816246, "grad_norm": 0.2620162833825017, "learning_rate": 3.181818181818182e-05, "loss": 0.36, "step": 481 }, { "epoch": 1.2822902796271638, "grad_norm": 0.34823059030048475, "learning_rate": 3.176877470355731e-05, "loss": 0.3595, "step": 482 }, { "epoch": 1.284953395472703, "grad_norm": 0.31553137736166625, "learning_rate": 3.171936758893281e-05, "loss": 0.3599, "step": 483 }, { "epoch": 1.2876165113182423, "grad_norm": 0.2955708469323441, "learning_rate": 3.16699604743083e-05, "loss": 0.3402, "step": 484 }, { "epoch": 1.2902796271637818, "grad_norm": 0.3913482669169413, "learning_rate": 3.16205533596838e-05, "loss": 0.3758, "step": 485 }, { "epoch": 1.2929427430093208, "grad_norm": 0.35700628657251265, "learning_rate": 3.157114624505929e-05, "loss": 0.3581, "step": 486 }, { "epoch": 1.2956058588548602, "grad_norm": 0.3014863988052369, "learning_rate": 3.152173913043479e-05, "loss": 0.3554, "step": 487 }, { "epoch": 1.2982689747003995, "grad_norm": 0.3644987716917946, "learning_rate": 3.147233201581028e-05, "loss": 0.3562, "step": 488 }, { "epoch": 1.3009320905459387, "grad_norm": 0.30956500239595414, "learning_rate": 3.1422924901185775e-05, "loss": 0.3454, "step": 489 }, { "epoch": 1.303595206391478, "grad_norm": 0.4175232794253573, "learning_rate": 3.1373517786561266e-05, "loss": 0.3641, "step": 490 }, { "epoch": 1.3062583222370172, "grad_norm": 0.28246226404029123, "learning_rate": 3.1324110671936764e-05, "loss": 0.3601, "step": 491 }, { "epoch": 1.3089214380825567, "grad_norm": 0.3755376891190061, "learning_rate": 3.1274703557312255e-05, "loss": 0.3774, "step": 492 }, { "epoch": 1.311584553928096, "grad_norm": 0.27298674883257873, "learning_rate": 3.1225296442687746e-05, "loss": 0.3627, "step": 493 }, { "epoch": 1.3142476697736352, "grad_norm": 0.3706229801540267, "learning_rate": 3.117588932806324e-05, "loss": 0.3735, "step": 494 }, { "epoch": 1.3169107856191744, "grad_norm": 0.28143910738942546, "learning_rate": 3.1126482213438735e-05, "loss": 0.3725, "step": 495 }, { "epoch": 1.3195739014647137, "grad_norm": 0.3349025665393724, "learning_rate": 3.1077075098814226e-05, "loss": 0.3659, "step": 496 }, { "epoch": 1.3222370173102531, "grad_norm": 0.29588987329109573, "learning_rate": 3.1027667984189724e-05, "loss": 0.3749, "step": 497 }, { "epoch": 1.3249001331557924, "grad_norm": 0.27901948593654424, "learning_rate": 3.0978260869565215e-05, "loss": 0.3555, "step": 498 }, { "epoch": 1.3275632490013316, "grad_norm": 0.3180943674654497, "learning_rate": 3.092885375494071e-05, "loss": 0.3399, "step": 499 }, { "epoch": 1.3302263648468708, "grad_norm": 0.3257820898386027, "learning_rate": 3.0879446640316203e-05, "loss": 0.3592, "step": 500 }, { "epoch": 1.33288948069241, "grad_norm": 0.29341640703427146, "learning_rate": 3.08300395256917e-05, "loss": 0.3602, "step": 501 }, { "epoch": 1.3355525965379493, "grad_norm": 0.2975810782284494, "learning_rate": 3.078063241106719e-05, "loss": 0.3392, "step": 502 }, { "epoch": 1.3382157123834886, "grad_norm": 0.26682712897635374, "learning_rate": 3.073122529644269e-05, "loss": 0.3539, "step": 503 }, { "epoch": 1.340878828229028, "grad_norm": 0.29028707302441564, "learning_rate": 3.068181818181818e-05, "loss": 0.3511, "step": 504 }, { "epoch": 1.3435419440745673, "grad_norm": 0.32760242848226895, "learning_rate": 3.063241106719368e-05, "loss": 0.3804, "step": 505 }, { "epoch": 1.3462050599201065, "grad_norm": 0.3092786220233137, "learning_rate": 3.058300395256917e-05, "loss": 0.3699, "step": 506 }, { "epoch": 1.3488681757656458, "grad_norm": 0.3020724813833627, "learning_rate": 3.053359683794467e-05, "loss": 0.3676, "step": 507 }, { "epoch": 1.351531291611185, "grad_norm": 0.2824033966398368, "learning_rate": 3.0484189723320162e-05, "loss": 0.3729, "step": 508 }, { "epoch": 1.3541944074567245, "grad_norm": 0.3618887388165828, "learning_rate": 3.0434782608695656e-05, "loss": 0.3554, "step": 509 }, { "epoch": 1.3568575233022637, "grad_norm": 0.28130180514019887, "learning_rate": 3.038537549407115e-05, "loss": 0.3553, "step": 510 }, { "epoch": 1.359520639147803, "grad_norm": 0.2893653104001468, "learning_rate": 3.0335968379446645e-05, "loss": 0.3782, "step": 511 }, { "epoch": 1.3621837549933422, "grad_norm": 0.3469803538239057, "learning_rate": 3.0286561264822133e-05, "loss": 0.3464, "step": 512 }, { "epoch": 1.3648468708388815, "grad_norm": 0.2732418490440155, "learning_rate": 3.0237154150197627e-05, "loss": 0.3616, "step": 513 }, { "epoch": 1.3675099866844207, "grad_norm": 0.28562062527552706, "learning_rate": 3.018774703557312e-05, "loss": 0.3535, "step": 514 }, { "epoch": 1.37017310252996, "grad_norm": 0.2658369004792245, "learning_rate": 3.0138339920948616e-05, "loss": 0.3725, "step": 515 }, { "epoch": 1.3728362183754994, "grad_norm": 0.29358847654377684, "learning_rate": 3.008893280632411e-05, "loss": 0.3496, "step": 516 }, { "epoch": 1.3754993342210386, "grad_norm": 0.27539943140564604, "learning_rate": 3.0039525691699605e-05, "loss": 0.369, "step": 517 }, { "epoch": 1.378162450066578, "grad_norm": 0.300263236071914, "learning_rate": 2.99901185770751e-05, "loss": 0.3585, "step": 518 }, { "epoch": 1.3808255659121171, "grad_norm": 0.31613231965587374, "learning_rate": 2.9940711462450593e-05, "loss": 0.3777, "step": 519 }, { "epoch": 1.3834886817576564, "grad_norm": 0.2770700909868314, "learning_rate": 2.9891304347826088e-05, "loss": 0.3561, "step": 520 }, { "epoch": 1.3861517976031958, "grad_norm": 0.3050401099786546, "learning_rate": 2.9841897233201582e-05, "loss": 0.3563, "step": 521 }, { "epoch": 1.388814913448735, "grad_norm": 0.2533844111874208, "learning_rate": 2.9792490118577076e-05, "loss": 0.3469, "step": 522 }, { "epoch": 1.3914780292942743, "grad_norm": 0.2695972396120006, "learning_rate": 2.974308300395257e-05, "loss": 0.3621, "step": 523 }, { "epoch": 1.3941411451398136, "grad_norm": 0.28186697645815617, "learning_rate": 2.9693675889328065e-05, "loss": 0.3559, "step": 524 }, { "epoch": 1.3968042609853528, "grad_norm": 0.26628352738719235, "learning_rate": 2.964426877470356e-05, "loss": 0.3646, "step": 525 }, { "epoch": 1.399467376830892, "grad_norm": 0.2833122304678988, "learning_rate": 2.9594861660079054e-05, "loss": 0.3552, "step": 526 }, { "epoch": 1.4021304926764313, "grad_norm": 0.26716813523678146, "learning_rate": 2.954545454545455e-05, "loss": 0.3345, "step": 527 }, { "epoch": 1.4047936085219708, "grad_norm": 0.2754005215378796, "learning_rate": 2.9496047430830043e-05, "loss": 0.3531, "step": 528 }, { "epoch": 1.40745672436751, "grad_norm": 0.3036387674463336, "learning_rate": 2.9446640316205537e-05, "loss": 0.3394, "step": 529 }, { "epoch": 1.4101198402130493, "grad_norm": 0.28788105676480225, "learning_rate": 2.939723320158103e-05, "loss": 0.342, "step": 530 }, { "epoch": 1.4127829560585885, "grad_norm": 0.28191999375557225, "learning_rate": 2.9347826086956526e-05, "loss": 0.3488, "step": 531 }, { "epoch": 1.4154460719041277, "grad_norm": 0.2973599610924886, "learning_rate": 2.9298418972332014e-05, "loss": 0.369, "step": 532 }, { "epoch": 1.4181091877496672, "grad_norm": 0.29639597168777376, "learning_rate": 2.9249011857707508e-05, "loss": 0.3696, "step": 533 }, { "epoch": 1.4207723035952065, "grad_norm": 0.2943864772067253, "learning_rate": 2.9199604743083002e-05, "loss": 0.3708, "step": 534 }, { "epoch": 1.4234354194407457, "grad_norm": 0.3275031870349291, "learning_rate": 2.9150197628458497e-05, "loss": 0.359, "step": 535 }, { "epoch": 1.426098535286285, "grad_norm": 0.288973368099439, "learning_rate": 2.910079051383399e-05, "loss": 0.3534, "step": 536 }, { "epoch": 1.4287616511318242, "grad_norm": 0.3066522465043432, "learning_rate": 2.9051383399209485e-05, "loss": 0.3568, "step": 537 }, { "epoch": 1.4314247669773636, "grad_norm": 0.3056985012074139, "learning_rate": 2.900197628458498e-05, "loss": 0.3457, "step": 538 }, { "epoch": 1.4340878828229027, "grad_norm": 0.2793941010759859, "learning_rate": 2.8952569169960474e-05, "loss": 0.3559, "step": 539 }, { "epoch": 1.4367509986684421, "grad_norm": 0.2535278252678889, "learning_rate": 2.890316205533597e-05, "loss": 0.3528, "step": 540 }, { "epoch": 1.4394141145139814, "grad_norm": 0.2842251418338047, "learning_rate": 2.8853754940711463e-05, "loss": 0.3522, "step": 541 }, { "epoch": 1.4420772303595206, "grad_norm": 0.2778073412674222, "learning_rate": 2.8804347826086957e-05, "loss": 0.3603, "step": 542 }, { "epoch": 1.4447403462050599, "grad_norm": 0.2554361454610928, "learning_rate": 2.8754940711462452e-05, "loss": 0.3635, "step": 543 }, { "epoch": 1.447403462050599, "grad_norm": 0.3049003958057493, "learning_rate": 2.8705533596837946e-05, "loss": 0.3602, "step": 544 }, { "epoch": 1.4500665778961386, "grad_norm": 0.2675057851041106, "learning_rate": 2.865612648221344e-05, "loss": 0.3612, "step": 545 }, { "epoch": 1.4527296937416778, "grad_norm": 0.24887490119807607, "learning_rate": 2.8606719367588935e-05, "loss": 0.3654, "step": 546 }, { "epoch": 1.455392809587217, "grad_norm": 0.3195728958038635, "learning_rate": 2.855731225296443e-05, "loss": 0.3513, "step": 547 }, { "epoch": 1.4580559254327563, "grad_norm": 0.2546987092178984, "learning_rate": 2.8507905138339924e-05, "loss": 0.3398, "step": 548 }, { "epoch": 1.4607190412782955, "grad_norm": 0.29773690473267483, "learning_rate": 2.8458498023715418e-05, "loss": 0.3694, "step": 549 }, { "epoch": 1.463382157123835, "grad_norm": 0.29315481833169116, "learning_rate": 2.8409090909090912e-05, "loss": 0.3426, "step": 550 }, { "epoch": 1.466045272969374, "grad_norm": 0.3296358712762741, "learning_rate": 2.8359683794466403e-05, "loss": 0.3761, "step": 551 }, { "epoch": 1.4687083888149135, "grad_norm": 0.2989240945630588, "learning_rate": 2.8310276679841894e-05, "loss": 0.3574, "step": 552 }, { "epoch": 1.4713715046604527, "grad_norm": 0.2933347023687216, "learning_rate": 2.826086956521739e-05, "loss": 0.3615, "step": 553 }, { "epoch": 1.474034620505992, "grad_norm": 0.31885875118020457, "learning_rate": 2.8211462450592883e-05, "loss": 0.3645, "step": 554 }, { "epoch": 1.4766977363515312, "grad_norm": 0.2777657172797497, "learning_rate": 2.8162055335968378e-05, "loss": 0.3531, "step": 555 }, { "epoch": 1.4793608521970705, "grad_norm": 0.3318676753935055, "learning_rate": 2.8112648221343872e-05, "loss": 0.3668, "step": 556 }, { "epoch": 1.48202396804261, "grad_norm": 0.3316376422278272, "learning_rate": 2.8063241106719366e-05, "loss": 0.348, "step": 557 }, { "epoch": 1.4846870838881492, "grad_norm": 0.34334200086282374, "learning_rate": 2.801383399209486e-05, "loss": 0.3684, "step": 558 }, { "epoch": 1.4873501997336884, "grad_norm": 0.2998752672686297, "learning_rate": 2.7964426877470355e-05, "loss": 0.343, "step": 559 }, { "epoch": 1.4900133155792277, "grad_norm": 0.323718625297975, "learning_rate": 2.791501976284585e-05, "loss": 0.3435, "step": 560 }, { "epoch": 1.492676431424767, "grad_norm": 0.3042077739086944, "learning_rate": 2.7865612648221344e-05, "loss": 0.357, "step": 561 }, { "epoch": 1.4953395472703064, "grad_norm": 0.3132911982849499, "learning_rate": 2.7816205533596838e-05, "loss": 0.3481, "step": 562 }, { "epoch": 1.4980026631158454, "grad_norm": 0.25389583970465485, "learning_rate": 2.7766798418972333e-05, "loss": 0.3567, "step": 563 }, { "epoch": 1.5006657789613849, "grad_norm": 0.263337393271962, "learning_rate": 2.7717391304347827e-05, "loss": 0.3431, "step": 564 }, { "epoch": 1.503328894806924, "grad_norm": 0.2712654205175259, "learning_rate": 2.766798418972332e-05, "loss": 0.3582, "step": 565 }, { "epoch": 1.5059920106524634, "grad_norm": 0.2612896047069462, "learning_rate": 2.7618577075098816e-05, "loss": 0.3445, "step": 566 }, { "epoch": 1.5086551264980028, "grad_norm": 0.27219615901029837, "learning_rate": 2.756916996047431e-05, "loss": 0.3652, "step": 567 }, { "epoch": 1.5113182423435418, "grad_norm": 0.24840155978956244, "learning_rate": 2.7519762845849805e-05, "loss": 0.3421, "step": 568 }, { "epoch": 1.5139813581890813, "grad_norm": 0.24176135920761713, "learning_rate": 2.74703557312253e-05, "loss": 0.3512, "step": 569 }, { "epoch": 1.5166444740346205, "grad_norm": 0.2647051981979065, "learning_rate": 2.7420948616600793e-05, "loss": 0.3499, "step": 570 }, { "epoch": 1.5193075898801598, "grad_norm": 0.27211007538489024, "learning_rate": 2.7371541501976284e-05, "loss": 0.3462, "step": 571 }, { "epoch": 1.521970705725699, "grad_norm": 0.2507493740105373, "learning_rate": 2.732213438735178e-05, "loss": 0.3434, "step": 572 }, { "epoch": 1.5246338215712383, "grad_norm": 0.2693556555763232, "learning_rate": 2.7272727272727273e-05, "loss": 0.3615, "step": 573 }, { "epoch": 1.5272969374167777, "grad_norm": 0.274645850715254, "learning_rate": 2.7223320158102767e-05, "loss": 0.3445, "step": 574 }, { "epoch": 1.5299600532623168, "grad_norm": 0.24351837189102682, "learning_rate": 2.7173913043478262e-05, "loss": 0.3686, "step": 575 }, { "epoch": 1.5326231691078562, "grad_norm": 0.27710340393878174, "learning_rate": 2.7124505928853756e-05, "loss": 0.3547, "step": 576 }, { "epoch": 1.5352862849533955, "grad_norm": 0.2806488747523977, "learning_rate": 2.707509881422925e-05, "loss": 0.3672, "step": 577 }, { "epoch": 1.5379494007989347, "grad_norm": 0.32294972985992815, "learning_rate": 2.7025691699604745e-05, "loss": 0.3527, "step": 578 }, { "epoch": 1.5406125166444742, "grad_norm": 0.24771959309258884, "learning_rate": 2.697628458498024e-05, "loss": 0.3626, "step": 579 }, { "epoch": 1.5432756324900132, "grad_norm": 0.31974111618484613, "learning_rate": 2.6926877470355734e-05, "loss": 0.3553, "step": 580 }, { "epoch": 1.5459387483355527, "grad_norm": 0.28071413168163195, "learning_rate": 2.6877470355731228e-05, "loss": 0.3676, "step": 581 }, { "epoch": 1.548601864181092, "grad_norm": 0.2584928716043461, "learning_rate": 2.6828063241106723e-05, "loss": 0.3427, "step": 582 }, { "epoch": 1.5512649800266312, "grad_norm": 0.2648608207536266, "learning_rate": 2.6778656126482217e-05, "loss": 0.3377, "step": 583 }, { "epoch": 1.5539280958721704, "grad_norm": 0.2671119266891378, "learning_rate": 2.672924901185771e-05, "loss": 0.3559, "step": 584 }, { "epoch": 1.5565912117177096, "grad_norm": 0.2840788018392293, "learning_rate": 2.6679841897233206e-05, "loss": 0.355, "step": 585 }, { "epoch": 1.559254327563249, "grad_norm": 0.29216560920303836, "learning_rate": 2.66304347826087e-05, "loss": 0.3625, "step": 586 }, { "epoch": 1.5619174434087881, "grad_norm": 0.2782406231477868, "learning_rate": 2.6581027667984194e-05, "loss": 0.3544, "step": 587 }, { "epoch": 1.5645805592543276, "grad_norm": 0.27482653297611137, "learning_rate": 2.653162055335969e-05, "loss": 0.3505, "step": 588 }, { "epoch": 1.5672436750998668, "grad_norm": 0.2737639812672786, "learning_rate": 2.6482213438735183e-05, "loss": 0.3339, "step": 589 }, { "epoch": 1.569906790945406, "grad_norm": 0.30172379604459587, "learning_rate": 2.643280632411067e-05, "loss": 0.3574, "step": 590 }, { "epoch": 1.5725699067909455, "grad_norm": 0.30937296239336515, "learning_rate": 2.6383399209486165e-05, "loss": 0.3552, "step": 591 }, { "epoch": 1.5752330226364846, "grad_norm": 0.30263893603202113, "learning_rate": 2.633399209486166e-05, "loss": 0.3806, "step": 592 }, { "epoch": 1.577896138482024, "grad_norm": 0.36351951882340405, "learning_rate": 2.6284584980237154e-05, "loss": 0.3483, "step": 593 }, { "epoch": 1.5805592543275633, "grad_norm": 0.27596120256597706, "learning_rate": 2.623517786561265e-05, "loss": 0.3785, "step": 594 }, { "epoch": 1.5832223701731025, "grad_norm": 0.30086295136857, "learning_rate": 2.6185770750988143e-05, "loss": 0.3536, "step": 595 }, { "epoch": 1.5858854860186418, "grad_norm": 0.3786534775512319, "learning_rate": 2.6136363636363637e-05, "loss": 0.3577, "step": 596 }, { "epoch": 1.588548601864181, "grad_norm": 0.294153803281236, "learning_rate": 2.608695652173913e-05, "loss": 0.3603, "step": 597 }, { "epoch": 1.5912117177097205, "grad_norm": 0.316506621080003, "learning_rate": 2.6037549407114626e-05, "loss": 0.3763, "step": 598 }, { "epoch": 1.5938748335552595, "grad_norm": 0.31539133712695033, "learning_rate": 2.598814229249012e-05, "loss": 0.3373, "step": 599 }, { "epoch": 1.596537949400799, "grad_norm": 0.29787884422276756, "learning_rate": 2.5938735177865615e-05, "loss": 0.3461, "step": 600 }, { "epoch": 1.5992010652463382, "grad_norm": 0.2794574362382508, "learning_rate": 2.588932806324111e-05, "loss": 0.3607, "step": 601 }, { "epoch": 1.6018641810918774, "grad_norm": 0.28198668683252337, "learning_rate": 2.5839920948616603e-05, "loss": 0.3698, "step": 602 }, { "epoch": 1.604527296937417, "grad_norm": 0.2767707782956735, "learning_rate": 2.5790513833992098e-05, "loss": 0.3358, "step": 603 }, { "epoch": 1.607190412782956, "grad_norm": 0.26770289783678053, "learning_rate": 2.5741106719367592e-05, "loss": 0.3376, "step": 604 }, { "epoch": 1.6098535286284954, "grad_norm": 0.3244106061056206, "learning_rate": 2.5691699604743087e-05, "loss": 0.3515, "step": 605 }, { "epoch": 1.6125166444740346, "grad_norm": 0.29260066196150414, "learning_rate": 2.564229249011858e-05, "loss": 0.3712, "step": 606 }, { "epoch": 1.6151797603195739, "grad_norm": 0.39595763824507085, "learning_rate": 2.5592885375494075e-05, "loss": 0.3402, "step": 607 }, { "epoch": 1.6178428761651131, "grad_norm": 0.2911698047056363, "learning_rate": 2.554347826086957e-05, "loss": 0.3579, "step": 608 }, { "epoch": 1.6205059920106524, "grad_norm": 0.30667505894069086, "learning_rate": 2.5494071146245064e-05, "loss": 0.3488, "step": 609 }, { "epoch": 1.6231691078561918, "grad_norm": 0.3377626596928706, "learning_rate": 2.5444664031620552e-05, "loss": 0.3455, "step": 610 }, { "epoch": 1.6258322237017309, "grad_norm": 0.3019507720671119, "learning_rate": 2.5395256916996046e-05, "loss": 0.352, "step": 611 }, { "epoch": 1.6284953395472703, "grad_norm": 0.2835949922829532, "learning_rate": 2.534584980237154e-05, "loss": 0.3602, "step": 612 }, { "epoch": 1.6311584553928096, "grad_norm": 0.32444980944074003, "learning_rate": 2.5296442687747035e-05, "loss": 0.3626, "step": 613 }, { "epoch": 1.6338215712383488, "grad_norm": 0.30852262333031255, "learning_rate": 2.524703557312253e-05, "loss": 0.3415, "step": 614 }, { "epoch": 1.6364846870838883, "grad_norm": 0.2769395153617194, "learning_rate": 2.5197628458498024e-05, "loss": 0.36, "step": 615 }, { "epoch": 1.6391478029294273, "grad_norm": 0.3225695333542542, "learning_rate": 2.5148221343873518e-05, "loss": 0.3504, "step": 616 }, { "epoch": 1.6418109187749668, "grad_norm": 0.26000908179747434, "learning_rate": 2.5098814229249012e-05, "loss": 0.3511, "step": 617 }, { "epoch": 1.644474034620506, "grad_norm": 0.2558998742720099, "learning_rate": 2.5049407114624507e-05, "loss": 0.3551, "step": 618 }, { "epoch": 1.6471371504660453, "grad_norm": 0.2810631366750719, "learning_rate": 2.5e-05, "loss": 0.359, "step": 619 }, { "epoch": 1.6498002663115847, "grad_norm": 0.2764036943026752, "learning_rate": 2.4950592885375496e-05, "loss": 0.3552, "step": 620 }, { "epoch": 1.6524633821571237, "grad_norm": 0.29157627798525887, "learning_rate": 2.490118577075099e-05, "loss": 0.3477, "step": 621 }, { "epoch": 1.6551264980026632, "grad_norm": 0.30005399168360375, "learning_rate": 2.4851778656126484e-05, "loss": 0.3635, "step": 622 }, { "epoch": 1.6577896138482024, "grad_norm": 0.28682265413573244, "learning_rate": 2.480237154150198e-05, "loss": 0.3472, "step": 623 }, { "epoch": 1.6604527296937417, "grad_norm": 0.30810891527099654, "learning_rate": 2.475296442687747e-05, "loss": 0.3453, "step": 624 }, { "epoch": 1.663115845539281, "grad_norm": 0.2894658697891752, "learning_rate": 2.4703557312252964e-05, "loss": 0.348, "step": 625 }, { "epoch": 1.6657789613848202, "grad_norm": 0.26056026406293753, "learning_rate": 2.465415019762846e-05, "loss": 0.3422, "step": 626 }, { "epoch": 1.6684420772303596, "grad_norm": 0.27955802745377495, "learning_rate": 2.4604743083003953e-05, "loss": 0.351, "step": 627 }, { "epoch": 1.6711051930758987, "grad_norm": 0.2589447838000819, "learning_rate": 2.4555335968379447e-05, "loss": 0.3606, "step": 628 }, { "epoch": 1.6737683089214381, "grad_norm": 0.2726720946381243, "learning_rate": 2.450592885375494e-05, "loss": 0.3553, "step": 629 }, { "epoch": 1.6764314247669774, "grad_norm": 0.29585982981776077, "learning_rate": 2.4456521739130436e-05, "loss": 0.3429, "step": 630 }, { "epoch": 1.6790945406125166, "grad_norm": 0.25866785993085295, "learning_rate": 2.440711462450593e-05, "loss": 0.3464, "step": 631 }, { "epoch": 1.681757656458056, "grad_norm": 0.26186173743371105, "learning_rate": 2.4357707509881425e-05, "loss": 0.3624, "step": 632 }, { "epoch": 1.684420772303595, "grad_norm": 0.27529386090536323, "learning_rate": 2.430830039525692e-05, "loss": 0.3464, "step": 633 }, { "epoch": 1.6870838881491346, "grad_norm": 0.24305368943964414, "learning_rate": 2.425889328063241e-05, "loss": 0.3542, "step": 634 }, { "epoch": 1.6897470039946738, "grad_norm": 0.263035963649886, "learning_rate": 2.4209486166007905e-05, "loss": 0.3638, "step": 635 }, { "epoch": 1.692410119840213, "grad_norm": 0.2737080512587832, "learning_rate": 2.41600790513834e-05, "loss": 0.3368, "step": 636 }, { "epoch": 1.6950732356857523, "grad_norm": 0.33404220986339256, "learning_rate": 2.4110671936758893e-05, "loss": 0.3724, "step": 637 }, { "epoch": 1.6977363515312915, "grad_norm": 0.2897416261690682, "learning_rate": 2.4061264822134388e-05, "loss": 0.3593, "step": 638 }, { "epoch": 1.700399467376831, "grad_norm": 0.3041816217006561, "learning_rate": 2.4011857707509882e-05, "loss": 0.3513, "step": 639 }, { "epoch": 1.70306258322237, "grad_norm": 0.2677006117678147, "learning_rate": 2.3962450592885376e-05, "loss": 0.3594, "step": 640 }, { "epoch": 1.7057256990679095, "grad_norm": 0.2783081801536929, "learning_rate": 2.391304347826087e-05, "loss": 0.3497, "step": 641 }, { "epoch": 1.7083888149134487, "grad_norm": 0.2949970037820572, "learning_rate": 2.3863636363636365e-05, "loss": 0.3527, "step": 642 }, { "epoch": 1.711051930758988, "grad_norm": 0.29435826287206446, "learning_rate": 2.381422924901186e-05, "loss": 0.3476, "step": 643 }, { "epoch": 1.7137150466045274, "grad_norm": 0.22820704347237256, "learning_rate": 2.376482213438735e-05, "loss": 0.3563, "step": 644 }, { "epoch": 1.7163781624500665, "grad_norm": 0.2662369562790593, "learning_rate": 2.3715415019762845e-05, "loss": 0.3564, "step": 645 }, { "epoch": 1.719041278295606, "grad_norm": 0.2660848595820705, "learning_rate": 2.366600790513834e-05, "loss": 0.3507, "step": 646 }, { "epoch": 1.7217043941411452, "grad_norm": 0.2736362440179924, "learning_rate": 2.3616600790513834e-05, "loss": 0.3583, "step": 647 }, { "epoch": 1.7243675099866844, "grad_norm": 0.2877841104207108, "learning_rate": 2.3567193675889328e-05, "loss": 0.3543, "step": 648 }, { "epoch": 1.7270306258322237, "grad_norm": 0.26935615929008033, "learning_rate": 2.3517786561264823e-05, "loss": 0.3437, "step": 649 }, { "epoch": 1.729693741677763, "grad_norm": 0.2578776022705283, "learning_rate": 2.3468379446640317e-05, "loss": 0.3665, "step": 650 }, { "epoch": 1.7323568575233024, "grad_norm": 0.28540169794092723, "learning_rate": 2.341897233201581e-05, "loss": 0.3427, "step": 651 }, { "epoch": 1.7350199733688414, "grad_norm": 0.302406678764912, "learning_rate": 2.3369565217391306e-05, "loss": 0.3493, "step": 652 }, { "epoch": 1.7376830892143809, "grad_norm": 0.2613558705976954, "learning_rate": 2.33201581027668e-05, "loss": 0.3384, "step": 653 }, { "epoch": 1.74034620505992, "grad_norm": 0.31445958338443253, "learning_rate": 2.327075098814229e-05, "loss": 0.3563, "step": 654 }, { "epoch": 1.7430093209054593, "grad_norm": 0.26295035895535324, "learning_rate": 2.3221343873517785e-05, "loss": 0.3523, "step": 655 }, { "epoch": 1.7456724367509988, "grad_norm": 0.26455791446031185, "learning_rate": 2.317193675889328e-05, "loss": 0.347, "step": 656 }, { "epoch": 1.7483355525965378, "grad_norm": 0.267920904226216, "learning_rate": 2.3122529644268774e-05, "loss": 0.3757, "step": 657 }, { "epoch": 1.7509986684420773, "grad_norm": 0.29766057642277893, "learning_rate": 2.307312252964427e-05, "loss": 0.3388, "step": 658 }, { "epoch": 1.7536617842876165, "grad_norm": 0.2614333124037635, "learning_rate": 2.3023715415019763e-05, "loss": 0.3448, "step": 659 }, { "epoch": 1.7563249001331558, "grad_norm": 0.2460873862604595, "learning_rate": 2.2974308300395257e-05, "loss": 0.3701, "step": 660 }, { "epoch": 1.758988015978695, "grad_norm": 0.32415471595000084, "learning_rate": 2.2924901185770752e-05, "loss": 0.3502, "step": 661 }, { "epoch": 1.7616511318242343, "grad_norm": 0.28861202445680917, "learning_rate": 2.2875494071146246e-05, "loss": 0.3419, "step": 662 }, { "epoch": 1.7643142476697737, "grad_norm": 0.33178480237112284, "learning_rate": 2.282608695652174e-05, "loss": 0.364, "step": 663 }, { "epoch": 1.7669773635153128, "grad_norm": 0.28362428197182826, "learning_rate": 2.2776679841897235e-05, "loss": 0.3447, "step": 664 }, { "epoch": 1.7696404793608522, "grad_norm": 0.2593493932357841, "learning_rate": 2.272727272727273e-05, "loss": 0.3566, "step": 665 }, { "epoch": 1.7723035952063915, "grad_norm": 0.32399886004151673, "learning_rate": 2.267786561264822e-05, "loss": 0.352, "step": 666 }, { "epoch": 1.7749667110519307, "grad_norm": 0.2898594306022826, "learning_rate": 2.2628458498023715e-05, "loss": 0.3552, "step": 667 }, { "epoch": 1.7776298268974702, "grad_norm": 0.30141440115798507, "learning_rate": 2.257905138339921e-05, "loss": 0.3394, "step": 668 }, { "epoch": 1.7802929427430092, "grad_norm": 0.2748566768296462, "learning_rate": 2.2529644268774703e-05, "loss": 0.3639, "step": 669 }, { "epoch": 1.7829560585885487, "grad_norm": 0.2597063738725183, "learning_rate": 2.2480237154150198e-05, "loss": 0.3523, "step": 670 }, { "epoch": 1.785619174434088, "grad_norm": 0.27428899527158185, "learning_rate": 2.2430830039525692e-05, "loss": 0.3576, "step": 671 }, { "epoch": 1.7882822902796272, "grad_norm": 0.27821642567843663, "learning_rate": 2.2381422924901187e-05, "loss": 0.3431, "step": 672 }, { "epoch": 1.7909454061251664, "grad_norm": 0.3009289717068197, "learning_rate": 2.233201581027668e-05, "loss": 0.3506, "step": 673 }, { "epoch": 1.7936085219707056, "grad_norm": 0.27901500754869907, "learning_rate": 2.2282608695652175e-05, "loss": 0.3413, "step": 674 }, { "epoch": 1.796271637816245, "grad_norm": 0.26359419972730574, "learning_rate": 2.223320158102767e-05, "loss": 0.3574, "step": 675 }, { "epoch": 1.7989347536617841, "grad_norm": 0.301875250326235, "learning_rate": 2.2183794466403164e-05, "loss": 0.3586, "step": 676 }, { "epoch": 1.8015978695073236, "grad_norm": 0.293396805853932, "learning_rate": 2.213438735177866e-05, "loss": 0.3631, "step": 677 }, { "epoch": 1.8042609853528628, "grad_norm": 0.2627077951859255, "learning_rate": 2.2084980237154153e-05, "loss": 0.3421, "step": 678 }, { "epoch": 1.806924101198402, "grad_norm": 0.2910041424241653, "learning_rate": 2.2035573122529647e-05, "loss": 0.3508, "step": 679 }, { "epoch": 1.8095872170439415, "grad_norm": 0.2700422024120216, "learning_rate": 2.198616600790514e-05, "loss": 0.3656, "step": 680 }, { "epoch": 1.8122503328894806, "grad_norm": 0.261122870241434, "learning_rate": 2.1936758893280636e-05, "loss": 0.3727, "step": 681 }, { "epoch": 1.81491344873502, "grad_norm": 0.2759182990026985, "learning_rate": 2.188735177865613e-05, "loss": 0.3429, "step": 682 }, { "epoch": 1.8175765645805593, "grad_norm": 0.25688731642570295, "learning_rate": 2.183794466403162e-05, "loss": 0.3638, "step": 683 }, { "epoch": 1.8202396804260985, "grad_norm": 0.2583299882188377, "learning_rate": 2.1788537549407116e-05, "loss": 0.3627, "step": 684 }, { "epoch": 1.822902796271638, "grad_norm": 0.24824630818405677, "learning_rate": 2.173913043478261e-05, "loss": 0.3509, "step": 685 }, { "epoch": 1.825565912117177, "grad_norm": 0.2775222142294749, "learning_rate": 2.1689723320158105e-05, "loss": 0.3421, "step": 686 }, { "epoch": 1.8282290279627165, "grad_norm": 0.23869310034905467, "learning_rate": 2.16403162055336e-05, "loss": 0.3376, "step": 687 }, { "epoch": 1.8308921438082557, "grad_norm": 0.2933357911415976, "learning_rate": 2.1590909090909093e-05, "loss": 0.3521, "step": 688 }, { "epoch": 1.833555259653795, "grad_norm": 0.27832210393035933, "learning_rate": 2.1541501976284588e-05, "loss": 0.3553, "step": 689 }, { "epoch": 1.8362183754993342, "grad_norm": 0.3087436970907245, "learning_rate": 2.1492094861660082e-05, "loss": 0.347, "step": 690 }, { "epoch": 1.8388814913448734, "grad_norm": 0.2943513499295711, "learning_rate": 2.1442687747035576e-05, "loss": 0.3536, "step": 691 }, { "epoch": 1.841544607190413, "grad_norm": 0.26722654225950093, "learning_rate": 2.1393280632411067e-05, "loss": 0.3624, "step": 692 }, { "epoch": 1.844207723035952, "grad_norm": 0.2686739391641238, "learning_rate": 2.1343873517786562e-05, "loss": 0.3551, "step": 693 }, { "epoch": 1.8468708388814914, "grad_norm": 0.3317404535951985, "learning_rate": 2.1294466403162056e-05, "loss": 0.3519, "step": 694 }, { "epoch": 1.8495339547270306, "grad_norm": 0.25888461414583197, "learning_rate": 2.124505928853755e-05, "loss": 0.3621, "step": 695 }, { "epoch": 1.8521970705725699, "grad_norm": 0.2388947383775022, "learning_rate": 2.1195652173913045e-05, "loss": 0.3464, "step": 696 }, { "epoch": 1.8548601864181093, "grad_norm": 0.32253652339123096, "learning_rate": 2.114624505928854e-05, "loss": 0.3486, "step": 697 }, { "epoch": 1.8575233022636484, "grad_norm": 0.23971764237483872, "learning_rate": 2.1096837944664034e-05, "loss": 0.3469, "step": 698 }, { "epoch": 1.8601864181091878, "grad_norm": 0.2822968430519757, "learning_rate": 2.1047430830039528e-05, "loss": 0.3464, "step": 699 }, { "epoch": 1.862849533954727, "grad_norm": 0.28707092445711563, "learning_rate": 2.0998023715415023e-05, "loss": 0.3454, "step": 700 }, { "epoch": 1.8655126498002663, "grad_norm": 0.26633357589223594, "learning_rate": 2.0948616600790517e-05, "loss": 0.3528, "step": 701 }, { "epoch": 1.8681757656458056, "grad_norm": 0.30480677025070735, "learning_rate": 2.0899209486166008e-05, "loss": 0.3705, "step": 702 }, { "epoch": 1.8708388814913448, "grad_norm": 0.2589295473498244, "learning_rate": 2.0849802371541502e-05, "loss": 0.366, "step": 703 }, { "epoch": 1.8735019973368843, "grad_norm": 0.3615686651832072, "learning_rate": 2.0800395256916997e-05, "loss": 0.3545, "step": 704 }, { "epoch": 1.8761651131824233, "grad_norm": 0.2643316410023579, "learning_rate": 2.075098814229249e-05, "loss": 0.3478, "step": 705 }, { "epoch": 1.8788282290279628, "grad_norm": 0.3002604064308654, "learning_rate": 2.0701581027667985e-05, "loss": 0.3691, "step": 706 }, { "epoch": 1.881491344873502, "grad_norm": 0.2842611156357375, "learning_rate": 2.065217391304348e-05, "loss": 0.361, "step": 707 }, { "epoch": 1.8841544607190412, "grad_norm": 0.3130168183378823, "learning_rate": 2.0602766798418974e-05, "loss": 0.3536, "step": 708 }, { "epoch": 1.8868175765645807, "grad_norm": 0.3519161067004107, "learning_rate": 2.055335968379447e-05, "loss": 0.3557, "step": 709 }, { "epoch": 1.8894806924101197, "grad_norm": 0.27233651062760655, "learning_rate": 2.0503952569169963e-05, "loss": 0.3594, "step": 710 }, { "epoch": 1.8921438082556592, "grad_norm": 0.31833253788492577, "learning_rate": 2.0454545454545457e-05, "loss": 0.3489, "step": 711 }, { "epoch": 1.8948069241011984, "grad_norm": 0.24567699858003664, "learning_rate": 2.040513833992095e-05, "loss": 0.3367, "step": 712 }, { "epoch": 1.8974700399467377, "grad_norm": 0.2969050880879015, "learning_rate": 2.0355731225296443e-05, "loss": 0.3537, "step": 713 }, { "epoch": 1.900133155792277, "grad_norm": 0.3189993081371087, "learning_rate": 2.0306324110671937e-05, "loss": 0.3669, "step": 714 }, { "epoch": 1.9027962716378162, "grad_norm": 0.24524923802003742, "learning_rate": 2.025691699604743e-05, "loss": 0.3448, "step": 715 }, { "epoch": 1.9054593874833556, "grad_norm": 0.3002012848114626, "learning_rate": 2.0207509881422926e-05, "loss": 0.3592, "step": 716 }, { "epoch": 1.9081225033288947, "grad_norm": 0.2577221774068482, "learning_rate": 2.015810276679842e-05, "loss": 0.3615, "step": 717 }, { "epoch": 1.9107856191744341, "grad_norm": 0.2662922499052391, "learning_rate": 2.0108695652173915e-05, "loss": 0.3564, "step": 718 }, { "epoch": 1.9134487350199734, "grad_norm": 0.2748543453818437, "learning_rate": 2.005928853754941e-05, "loss": 0.3367, "step": 719 }, { "epoch": 1.9161118508655126, "grad_norm": 0.29453902437825724, "learning_rate": 2.0009881422924903e-05, "loss": 0.3346, "step": 720 }, { "epoch": 1.918774966711052, "grad_norm": 0.2958384946201868, "learning_rate": 1.9960474308300398e-05, "loss": 0.3653, "step": 721 }, { "epoch": 1.921438082556591, "grad_norm": 0.3110870857995837, "learning_rate": 1.991106719367589e-05, "loss": 0.3626, "step": 722 }, { "epoch": 1.9241011984021306, "grad_norm": 0.29754006004298117, "learning_rate": 1.9861660079051383e-05, "loss": 0.3595, "step": 723 }, { "epoch": 1.9267643142476698, "grad_norm": 0.2637206512469971, "learning_rate": 1.9812252964426878e-05, "loss": 0.3637, "step": 724 }, { "epoch": 1.929427430093209, "grad_norm": 0.28572071909963137, "learning_rate": 1.9762845849802372e-05, "loss": 0.351, "step": 725 }, { "epoch": 1.9320905459387483, "grad_norm": 0.26449910347561634, "learning_rate": 1.9713438735177866e-05, "loss": 0.3607, "step": 726 }, { "epoch": 1.9347536617842875, "grad_norm": 0.312752897256756, "learning_rate": 1.966403162055336e-05, "loss": 0.3591, "step": 727 }, { "epoch": 1.937416777629827, "grad_norm": 0.2592410502272739, "learning_rate": 1.9614624505928855e-05, "loss": 0.3439, "step": 728 }, { "epoch": 1.940079893475366, "grad_norm": 0.24250837194662156, "learning_rate": 1.956521739130435e-05, "loss": 0.3322, "step": 729 }, { "epoch": 1.9427430093209055, "grad_norm": 0.27100632690728255, "learning_rate": 1.9515810276679844e-05, "loss": 0.3478, "step": 730 }, { "epoch": 1.9454061251664447, "grad_norm": 0.2792664428193274, "learning_rate": 1.9466403162055335e-05, "loss": 0.3667, "step": 731 }, { "epoch": 1.948069241011984, "grad_norm": 0.2619688688672022, "learning_rate": 1.941699604743083e-05, "loss": 0.3533, "step": 732 }, { "epoch": 1.9507323568575234, "grad_norm": 0.250474396728028, "learning_rate": 1.9367588932806324e-05, "loss": 0.3615, "step": 733 }, { "epoch": 1.9533954727030625, "grad_norm": 0.2592917559527508, "learning_rate": 1.9318181818181818e-05, "loss": 0.35, "step": 734 }, { "epoch": 1.956058588548602, "grad_norm": 0.28358412495828245, "learning_rate": 1.9268774703557312e-05, "loss": 0.3438, "step": 735 }, { "epoch": 1.9587217043941412, "grad_norm": 0.2905168266596484, "learning_rate": 1.9219367588932807e-05, "loss": 0.3363, "step": 736 }, { "epoch": 1.9613848202396804, "grad_norm": 0.2558334592646534, "learning_rate": 1.91699604743083e-05, "loss": 0.3636, "step": 737 }, { "epoch": 1.9640479360852197, "grad_norm": 0.2856486905717076, "learning_rate": 1.9120553359683796e-05, "loss": 0.3423, "step": 738 }, { "epoch": 1.966711051930759, "grad_norm": 0.25338680291782845, "learning_rate": 1.907114624505929e-05, "loss": 0.3647, "step": 739 }, { "epoch": 1.9693741677762984, "grad_norm": 0.25927241893410596, "learning_rate": 1.9021739130434784e-05, "loss": 0.361, "step": 740 }, { "epoch": 1.9720372836218374, "grad_norm": 0.26559107296256046, "learning_rate": 1.8972332015810275e-05, "loss": 0.3532, "step": 741 }, { "epoch": 1.9747003994673769, "grad_norm": 0.23909262831928838, "learning_rate": 1.892292490118577e-05, "loss": 0.3458, "step": 742 }, { "epoch": 1.977363515312916, "grad_norm": 0.29570607043062813, "learning_rate": 1.8873517786561264e-05, "loss": 0.3651, "step": 743 }, { "epoch": 1.9800266311584553, "grad_norm": 0.26837566907079335, "learning_rate": 1.882411067193676e-05, "loss": 0.3624, "step": 744 }, { "epoch": 1.9826897470039948, "grad_norm": 0.24855234703810405, "learning_rate": 1.8774703557312253e-05, "loss": 0.3458, "step": 745 }, { "epoch": 1.9853528628495338, "grad_norm": 0.2581276414313357, "learning_rate": 1.8725296442687747e-05, "loss": 0.3532, "step": 746 }, { "epoch": 1.9880159786950733, "grad_norm": 0.2769192507293847, "learning_rate": 1.867588932806324e-05, "loss": 0.3662, "step": 747 }, { "epoch": 1.9906790945406125, "grad_norm": 0.24782306003081656, "learning_rate": 1.8626482213438736e-05, "loss": 0.3444, "step": 748 }, { "epoch": 1.9933422103861518, "grad_norm": 0.23338769959338118, "learning_rate": 1.857707509881423e-05, "loss": 0.3375, "step": 749 }, { "epoch": 1.996005326231691, "grad_norm": 0.2399452380668713, "learning_rate": 1.8527667984189725e-05, "loss": 0.3577, "step": 750 }, { "epoch": 1.9986684420772303, "grad_norm": 0.24061002934920092, "learning_rate": 1.8478260869565216e-05, "loss": 0.3558, "step": 751 }, { "epoch": 2.0, "grad_norm": 0.3876397436943037, "learning_rate": 1.842885375494071e-05, "loss": 0.3232, "step": 752 }, { "epoch": 2.0026631158455395, "grad_norm": 0.3255318592205839, "learning_rate": 1.8379446640316205e-05, "loss": 0.2829, "step": 753 }, { "epoch": 2.0053262316910785, "grad_norm": 0.2688339427044817, "learning_rate": 1.83300395256917e-05, "loss": 0.2808, "step": 754 }, { "epoch": 2.007989347536618, "grad_norm": 0.31006819974729777, "learning_rate": 1.8280632411067193e-05, "loss": 0.2619, "step": 755 }, { "epoch": 2.010652463382157, "grad_norm": 0.3391232912122683, "learning_rate": 1.8231225296442688e-05, "loss": 0.2797, "step": 756 }, { "epoch": 2.0133155792276964, "grad_norm": 0.22961985808221483, "learning_rate": 1.8181818181818182e-05, "loss": 0.2716, "step": 757 }, { "epoch": 2.015978695073236, "grad_norm": 0.3029488541333639, "learning_rate": 1.8132411067193676e-05, "loss": 0.2748, "step": 758 }, { "epoch": 2.018641810918775, "grad_norm": 0.3272089229771229, "learning_rate": 1.808300395256917e-05, "loss": 0.259, "step": 759 }, { "epoch": 2.0213049267643144, "grad_norm": 0.2632568547847837, "learning_rate": 1.8033596837944665e-05, "loss": 0.2765, "step": 760 }, { "epoch": 2.0239680426098534, "grad_norm": 0.278440470950714, "learning_rate": 1.7984189723320156e-05, "loss": 0.2638, "step": 761 }, { "epoch": 2.026631158455393, "grad_norm": 0.3139907981507755, "learning_rate": 1.793478260869565e-05, "loss": 0.2805, "step": 762 }, { "epoch": 2.029294274300932, "grad_norm": 0.26955412514066035, "learning_rate": 1.7885375494071145e-05, "loss": 0.2617, "step": 763 }, { "epoch": 2.0319573901464714, "grad_norm": 0.2583856619944918, "learning_rate": 1.783596837944664e-05, "loss": 0.2678, "step": 764 }, { "epoch": 2.034620505992011, "grad_norm": 0.27298004272506543, "learning_rate": 1.7786561264822134e-05, "loss": 0.2674, "step": 765 }, { "epoch": 2.03728362183755, "grad_norm": 0.272776301937256, "learning_rate": 1.7737154150197628e-05, "loss": 0.2783, "step": 766 }, { "epoch": 2.0399467376830893, "grad_norm": 0.23604664211204196, "learning_rate": 1.7687747035573123e-05, "loss": 0.2694, "step": 767 }, { "epoch": 2.0426098535286283, "grad_norm": 0.2705685089413051, "learning_rate": 1.7638339920948617e-05, "loss": 0.2835, "step": 768 }, { "epoch": 2.045272969374168, "grad_norm": 0.2348856411632335, "learning_rate": 1.758893280632411e-05, "loss": 0.2591, "step": 769 }, { "epoch": 2.0479360852197073, "grad_norm": 0.24862768901035942, "learning_rate": 1.7539525691699606e-05, "loss": 0.2641, "step": 770 }, { "epoch": 2.0505992010652463, "grad_norm": 0.25511185080416404, "learning_rate": 1.74901185770751e-05, "loss": 0.2709, "step": 771 }, { "epoch": 2.0532623169107858, "grad_norm": 0.24302033763825434, "learning_rate": 1.7440711462450594e-05, "loss": 0.2759, "step": 772 }, { "epoch": 2.0559254327563248, "grad_norm": 0.20872328589643, "learning_rate": 1.739130434782609e-05, "loss": 0.2632, "step": 773 }, { "epoch": 2.0585885486018642, "grad_norm": 0.26636593407387676, "learning_rate": 1.7341897233201583e-05, "loss": 0.2636, "step": 774 }, { "epoch": 2.0612516644474033, "grad_norm": 0.28091568129361494, "learning_rate": 1.7292490118577078e-05, "loss": 0.2628, "step": 775 }, { "epoch": 2.0639147802929427, "grad_norm": 0.2560746499348802, "learning_rate": 1.7243083003952572e-05, "loss": 0.2655, "step": 776 }, { "epoch": 2.066577896138482, "grad_norm": 0.26276899174108526, "learning_rate": 1.7193675889328066e-05, "loss": 0.2728, "step": 777 }, { "epoch": 2.069241011984021, "grad_norm": 0.26384946938199305, "learning_rate": 1.714426877470356e-05, "loss": 0.2747, "step": 778 }, { "epoch": 2.0719041278295607, "grad_norm": 0.23715984391863434, "learning_rate": 1.7094861660079055e-05, "loss": 0.2694, "step": 779 }, { "epoch": 2.0745672436750997, "grad_norm": 0.2404103191932088, "learning_rate": 1.7045454545454546e-05, "loss": 0.2844, "step": 780 }, { "epoch": 2.077230359520639, "grad_norm": 0.2295546055568796, "learning_rate": 1.699604743083004e-05, "loss": 0.2563, "step": 781 }, { "epoch": 2.0798934753661786, "grad_norm": 0.25081138258701596, "learning_rate": 1.6946640316205535e-05, "loss": 0.2657, "step": 782 }, { "epoch": 2.0825565912117177, "grad_norm": 0.23299102413940379, "learning_rate": 1.689723320158103e-05, "loss": 0.2841, "step": 783 }, { "epoch": 2.085219707057257, "grad_norm": 0.2352302932330538, "learning_rate": 1.6847826086956524e-05, "loss": 0.2696, "step": 784 }, { "epoch": 2.087882822902796, "grad_norm": 0.2396805580902733, "learning_rate": 1.6798418972332018e-05, "loss": 0.2687, "step": 785 }, { "epoch": 2.0905459387483356, "grad_norm": 0.22897484277870242, "learning_rate": 1.6749011857707512e-05, "loss": 0.2678, "step": 786 }, { "epoch": 2.0932090545938746, "grad_norm": 0.224891214268194, "learning_rate": 1.6699604743083007e-05, "loss": 0.2729, "step": 787 }, { "epoch": 2.095872170439414, "grad_norm": 0.26860270920114504, "learning_rate": 1.66501976284585e-05, "loss": 0.2581, "step": 788 }, { "epoch": 2.0985352862849536, "grad_norm": 0.24961552358211944, "learning_rate": 1.6600790513833996e-05, "loss": 0.2624, "step": 789 }, { "epoch": 2.1011984021304926, "grad_norm": 0.22308364748740767, "learning_rate": 1.6551383399209487e-05, "loss": 0.2647, "step": 790 }, { "epoch": 2.103861517976032, "grad_norm": 0.2380839364570976, "learning_rate": 1.650197628458498e-05, "loss": 0.271, "step": 791 }, { "epoch": 2.106524633821571, "grad_norm": 0.24381955578610937, "learning_rate": 1.6452569169960475e-05, "loss": 0.2694, "step": 792 }, { "epoch": 2.1091877496671105, "grad_norm": 0.23758646142710013, "learning_rate": 1.640316205533597e-05, "loss": 0.2775, "step": 793 }, { "epoch": 2.11185086551265, "grad_norm": 0.23538198400085814, "learning_rate": 1.6353754940711464e-05, "loss": 0.2814, "step": 794 }, { "epoch": 2.114513981358189, "grad_norm": 0.21674748879871775, "learning_rate": 1.630434782608696e-05, "loss": 0.2548, "step": 795 }, { "epoch": 2.1171770972037285, "grad_norm": 0.24105445224605443, "learning_rate": 1.6254940711462453e-05, "loss": 0.2641, "step": 796 }, { "epoch": 2.1198402130492675, "grad_norm": 0.23753067329213304, "learning_rate": 1.6205533596837947e-05, "loss": 0.2709, "step": 797 }, { "epoch": 2.122503328894807, "grad_norm": 0.23404194217010732, "learning_rate": 1.615612648221344e-05, "loss": 0.271, "step": 798 }, { "epoch": 2.125166444740346, "grad_norm": 0.2121069651623829, "learning_rate": 1.6106719367588936e-05, "loss": 0.2627, "step": 799 }, { "epoch": 2.1278295605858855, "grad_norm": 0.22624703639894228, "learning_rate": 1.6057312252964427e-05, "loss": 0.2538, "step": 800 }, { "epoch": 2.130492676431425, "grad_norm": 0.2386292992012449, "learning_rate": 1.600790513833992e-05, "loss": 0.2576, "step": 801 }, { "epoch": 2.133155792276964, "grad_norm": 0.22877737188756703, "learning_rate": 1.5958498023715416e-05, "loss": 0.2727, "step": 802 }, { "epoch": 2.1358189081225034, "grad_norm": 0.27117813021650006, "learning_rate": 1.590909090909091e-05, "loss": 0.2895, "step": 803 }, { "epoch": 2.1384820239680424, "grad_norm": 0.22867337217751538, "learning_rate": 1.5859683794466405e-05, "loss": 0.2734, "step": 804 }, { "epoch": 2.141145139813582, "grad_norm": 0.24512337588151054, "learning_rate": 1.58102766798419e-05, "loss": 0.273, "step": 805 }, { "epoch": 2.1438082556591214, "grad_norm": 0.2727608695581687, "learning_rate": 1.5760869565217393e-05, "loss": 0.2901, "step": 806 }, { "epoch": 2.1464713715046604, "grad_norm": 0.2387866974014394, "learning_rate": 1.5711462450592888e-05, "loss": 0.2643, "step": 807 }, { "epoch": 2.1491344873502, "grad_norm": 0.22440460077720992, "learning_rate": 1.5662055335968382e-05, "loss": 0.2653, "step": 808 }, { "epoch": 2.151797603195739, "grad_norm": 0.248288295680679, "learning_rate": 1.5612648221343873e-05, "loss": 0.2549, "step": 809 }, { "epoch": 2.1544607190412783, "grad_norm": 0.24110717758110342, "learning_rate": 1.5563241106719367e-05, "loss": 0.2748, "step": 810 }, { "epoch": 2.157123834886818, "grad_norm": 0.23171730936199766, "learning_rate": 1.5513833992094862e-05, "loss": 0.2709, "step": 811 }, { "epoch": 2.159786950732357, "grad_norm": 0.22345452374040276, "learning_rate": 1.5464426877470356e-05, "loss": 0.2688, "step": 812 }, { "epoch": 2.1624500665778963, "grad_norm": 0.26551342546130663, "learning_rate": 1.541501976284585e-05, "loss": 0.2709, "step": 813 }, { "epoch": 2.1651131824234353, "grad_norm": 0.2375754285218798, "learning_rate": 1.5365612648221345e-05, "loss": 0.259, "step": 814 }, { "epoch": 2.1677762982689748, "grad_norm": 0.2115542246448785, "learning_rate": 1.531620553359684e-05, "loss": 0.2684, "step": 815 }, { "epoch": 2.170439414114514, "grad_norm": 0.2447171773393202, "learning_rate": 1.5266798418972334e-05, "loss": 0.2762, "step": 816 }, { "epoch": 2.1731025299600533, "grad_norm": 0.22704904523049146, "learning_rate": 1.5217391304347828e-05, "loss": 0.2587, "step": 817 }, { "epoch": 2.1757656458055927, "grad_norm": 0.2103985476952429, "learning_rate": 1.5167984189723323e-05, "loss": 0.2706, "step": 818 }, { "epoch": 2.1784287616511318, "grad_norm": 0.25159263014889965, "learning_rate": 1.5118577075098814e-05, "loss": 0.2584, "step": 819 }, { "epoch": 2.181091877496671, "grad_norm": 0.24458443995501622, "learning_rate": 1.5069169960474308e-05, "loss": 0.2704, "step": 820 }, { "epoch": 2.1837549933422102, "grad_norm": 0.22057301940141671, "learning_rate": 1.5019762845849802e-05, "loss": 0.2719, "step": 821 }, { "epoch": 2.1864181091877497, "grad_norm": 0.267519780973077, "learning_rate": 1.4970355731225297e-05, "loss": 0.2716, "step": 822 }, { "epoch": 2.1890812250332887, "grad_norm": 0.22154250046870252, "learning_rate": 1.4920948616600791e-05, "loss": 0.2591, "step": 823 }, { "epoch": 2.191744340878828, "grad_norm": 0.21165234414085649, "learning_rate": 1.4871541501976285e-05, "loss": 0.2655, "step": 824 }, { "epoch": 2.1944074567243677, "grad_norm": 0.24374815251314244, "learning_rate": 1.482213438735178e-05, "loss": 0.2655, "step": 825 }, { "epoch": 2.1970705725699067, "grad_norm": 0.2455699195489871, "learning_rate": 1.4772727272727274e-05, "loss": 0.2665, "step": 826 }, { "epoch": 2.199733688415446, "grad_norm": 0.22958103222280501, "learning_rate": 1.4723320158102769e-05, "loss": 0.266, "step": 827 }, { "epoch": 2.202396804260985, "grad_norm": 0.22203196516766327, "learning_rate": 1.4673913043478263e-05, "loss": 0.2646, "step": 828 }, { "epoch": 2.2050599201065246, "grad_norm": 0.24608492700980994, "learning_rate": 1.4624505928853754e-05, "loss": 0.2794, "step": 829 }, { "epoch": 2.207723035952064, "grad_norm": 0.21991565592070453, "learning_rate": 1.4575098814229248e-05, "loss": 0.2721, "step": 830 }, { "epoch": 2.210386151797603, "grad_norm": 0.21684224263000038, "learning_rate": 1.4525691699604743e-05, "loss": 0.2584, "step": 831 }, { "epoch": 2.2130492676431426, "grad_norm": 0.25977569519470245, "learning_rate": 1.4476284584980237e-05, "loss": 0.2726, "step": 832 }, { "epoch": 2.2157123834886816, "grad_norm": 0.2386084151402447, "learning_rate": 1.4426877470355732e-05, "loss": 0.2852, "step": 833 }, { "epoch": 2.218375499334221, "grad_norm": 0.21986693449971093, "learning_rate": 1.4377470355731226e-05, "loss": 0.2626, "step": 834 }, { "epoch": 2.2210386151797605, "grad_norm": 0.21749065277576188, "learning_rate": 1.432806324110672e-05, "loss": 0.2602, "step": 835 }, { "epoch": 2.2237017310252996, "grad_norm": 0.23989512729814974, "learning_rate": 1.4278656126482215e-05, "loss": 0.2692, "step": 836 }, { "epoch": 2.226364846870839, "grad_norm": 0.23832582321216103, "learning_rate": 1.4229249011857709e-05, "loss": 0.2635, "step": 837 }, { "epoch": 2.229027962716378, "grad_norm": 0.2426811597238821, "learning_rate": 1.4179841897233202e-05, "loss": 0.2668, "step": 838 }, { "epoch": 2.2316910785619175, "grad_norm": 0.22741820303496693, "learning_rate": 1.4130434782608694e-05, "loss": 0.2687, "step": 839 }, { "epoch": 2.2343541944074565, "grad_norm": 0.2193731262262756, "learning_rate": 1.4081027667984189e-05, "loss": 0.2707, "step": 840 }, { "epoch": 2.237017310252996, "grad_norm": 0.22566921822696567, "learning_rate": 1.4031620553359683e-05, "loss": 0.2676, "step": 841 }, { "epoch": 2.2396804260985355, "grad_norm": 0.22383415671065598, "learning_rate": 1.3982213438735178e-05, "loss": 0.2652, "step": 842 }, { "epoch": 2.2423435419440745, "grad_norm": 0.20320657711674117, "learning_rate": 1.3932806324110672e-05, "loss": 0.2595, "step": 843 }, { "epoch": 2.245006657789614, "grad_norm": 0.2333067790520279, "learning_rate": 1.3883399209486166e-05, "loss": 0.2584, "step": 844 }, { "epoch": 2.247669773635153, "grad_norm": 0.2198492093260434, "learning_rate": 1.383399209486166e-05, "loss": 0.2787, "step": 845 }, { "epoch": 2.2503328894806924, "grad_norm": 0.20578959481390344, "learning_rate": 1.3784584980237155e-05, "loss": 0.2717, "step": 846 }, { "epoch": 2.2529960053262315, "grad_norm": 0.23821537591362393, "learning_rate": 1.373517786561265e-05, "loss": 0.2699, "step": 847 }, { "epoch": 2.255659121171771, "grad_norm": 0.22087113735109618, "learning_rate": 1.3685770750988142e-05, "loss": 0.2643, "step": 848 }, { "epoch": 2.2583222370173104, "grad_norm": 0.21122229854050678, "learning_rate": 1.3636363636363637e-05, "loss": 0.2724, "step": 849 }, { "epoch": 2.2609853528628494, "grad_norm": 0.21706856754708864, "learning_rate": 1.3586956521739131e-05, "loss": 0.2726, "step": 850 }, { "epoch": 2.263648468708389, "grad_norm": 0.21623723691120003, "learning_rate": 1.3537549407114625e-05, "loss": 0.2551, "step": 851 }, { "epoch": 2.266311584553928, "grad_norm": 0.2271100658389757, "learning_rate": 1.348814229249012e-05, "loss": 0.2586, "step": 852 }, { "epoch": 2.2689747003994674, "grad_norm": 0.2209764109681619, "learning_rate": 1.3438735177865614e-05, "loss": 0.2716, "step": 853 }, { "epoch": 2.271637816245007, "grad_norm": 0.2178701412614265, "learning_rate": 1.3389328063241108e-05, "loss": 0.2891, "step": 854 }, { "epoch": 2.274300932090546, "grad_norm": 0.2661642988662999, "learning_rate": 1.3339920948616603e-05, "loss": 0.2564, "step": 855 }, { "epoch": 2.2769640479360853, "grad_norm": 0.21388446109096484, "learning_rate": 1.3290513833992097e-05, "loss": 0.2529, "step": 856 }, { "epoch": 2.2796271637816243, "grad_norm": 0.2216576992935052, "learning_rate": 1.3241106719367592e-05, "loss": 0.2636, "step": 857 }, { "epoch": 2.282290279627164, "grad_norm": 0.23210662511306396, "learning_rate": 1.3191699604743083e-05, "loss": 0.2589, "step": 858 }, { "epoch": 2.2849533954727033, "grad_norm": 0.2392108261983096, "learning_rate": 1.3142292490118577e-05, "loss": 0.265, "step": 859 }, { "epoch": 2.2876165113182423, "grad_norm": 0.21786440972478727, "learning_rate": 1.3092885375494071e-05, "loss": 0.2793, "step": 860 }, { "epoch": 2.2902796271637818, "grad_norm": 0.260403587668551, "learning_rate": 1.3043478260869566e-05, "loss": 0.2777, "step": 861 }, { "epoch": 2.2929427430093208, "grad_norm": 0.2430960989806936, "learning_rate": 1.299407114624506e-05, "loss": 0.2572, "step": 862 }, { "epoch": 2.2956058588548602, "grad_norm": 0.21752051573777517, "learning_rate": 1.2944664031620555e-05, "loss": 0.2803, "step": 863 }, { "epoch": 2.2982689747003997, "grad_norm": 0.2573344766515025, "learning_rate": 1.2895256916996049e-05, "loss": 0.2803, "step": 864 }, { "epoch": 2.3009320905459387, "grad_norm": 0.24369267722963625, "learning_rate": 1.2845849802371543e-05, "loss": 0.2559, "step": 865 }, { "epoch": 2.303595206391478, "grad_norm": 0.2676475243278646, "learning_rate": 1.2796442687747038e-05, "loss": 0.2634, "step": 866 }, { "epoch": 2.306258322237017, "grad_norm": 0.21674298638149098, "learning_rate": 1.2747035573122532e-05, "loss": 0.2673, "step": 867 }, { "epoch": 2.3089214380825567, "grad_norm": 0.23541545396380092, "learning_rate": 1.2697628458498023e-05, "loss": 0.2673, "step": 868 }, { "epoch": 2.3115845539280957, "grad_norm": 0.22699711620607352, "learning_rate": 1.2648221343873517e-05, "loss": 0.2674, "step": 869 }, { "epoch": 2.314247669773635, "grad_norm": 0.22613468537499234, "learning_rate": 1.2598814229249012e-05, "loss": 0.2655, "step": 870 }, { "epoch": 2.316910785619174, "grad_norm": 0.2253665054481723, "learning_rate": 1.2549407114624506e-05, "loss": 0.2542, "step": 871 }, { "epoch": 2.3195739014647137, "grad_norm": 0.2389905563347208, "learning_rate": 1.25e-05, "loss": 0.2642, "step": 872 }, { "epoch": 2.322237017310253, "grad_norm": 0.1972800090188119, "learning_rate": 1.2450592885375495e-05, "loss": 0.2738, "step": 873 }, { "epoch": 2.324900133155792, "grad_norm": 0.22018172948520282, "learning_rate": 1.240118577075099e-05, "loss": 0.2736, "step": 874 }, { "epoch": 2.3275632490013316, "grad_norm": 0.22660897800754057, "learning_rate": 1.2351778656126482e-05, "loss": 0.2797, "step": 875 }, { "epoch": 2.3302263648468706, "grad_norm": 0.22691181432819396, "learning_rate": 1.2302371541501976e-05, "loss": 0.2562, "step": 876 }, { "epoch": 2.33288948069241, "grad_norm": 0.21367535241766863, "learning_rate": 1.225296442687747e-05, "loss": 0.2687, "step": 877 }, { "epoch": 2.3355525965379496, "grad_norm": 0.23289737129114052, "learning_rate": 1.2203557312252965e-05, "loss": 0.2595, "step": 878 }, { "epoch": 2.3382157123834886, "grad_norm": 0.21941025876118542, "learning_rate": 1.215415019762846e-05, "loss": 0.2785, "step": 879 }, { "epoch": 2.340878828229028, "grad_norm": 0.23113074495001715, "learning_rate": 1.2104743083003952e-05, "loss": 0.283, "step": 880 }, { "epoch": 2.343541944074567, "grad_norm": 0.21978182787011594, "learning_rate": 1.2055335968379447e-05, "loss": 0.2602, "step": 881 }, { "epoch": 2.3462050599201065, "grad_norm": 0.22558732477437654, "learning_rate": 1.2005928853754941e-05, "loss": 0.2744, "step": 882 }, { "epoch": 2.348868175765646, "grad_norm": 0.21761347406156886, "learning_rate": 1.1956521739130435e-05, "loss": 0.2702, "step": 883 }, { "epoch": 2.351531291611185, "grad_norm": 0.5461188257601155, "learning_rate": 1.190711462450593e-05, "loss": 0.2894, "step": 884 }, { "epoch": 2.3541944074567245, "grad_norm": 0.21406318400975563, "learning_rate": 1.1857707509881423e-05, "loss": 0.2661, "step": 885 }, { "epoch": 2.3568575233022635, "grad_norm": 0.1984149911802996, "learning_rate": 1.1808300395256917e-05, "loss": 0.266, "step": 886 }, { "epoch": 2.359520639147803, "grad_norm": 0.21968916065746072, "learning_rate": 1.1758893280632411e-05, "loss": 0.2635, "step": 887 }, { "epoch": 2.3621837549933424, "grad_norm": 0.22188429396465353, "learning_rate": 1.1709486166007906e-05, "loss": 0.2729, "step": 888 }, { "epoch": 2.3648468708388815, "grad_norm": 0.21019336767245783, "learning_rate": 1.16600790513834e-05, "loss": 0.2773, "step": 889 }, { "epoch": 2.367509986684421, "grad_norm": 0.22711608967366953, "learning_rate": 1.1610671936758893e-05, "loss": 0.2714, "step": 890 }, { "epoch": 2.37017310252996, "grad_norm": 0.2226773168313416, "learning_rate": 1.1561264822134387e-05, "loss": 0.264, "step": 891 }, { "epoch": 2.3728362183754994, "grad_norm": 0.21211073663718902, "learning_rate": 1.1511857707509881e-05, "loss": 0.2623, "step": 892 }, { "epoch": 2.3754993342210384, "grad_norm": 0.22155796804883984, "learning_rate": 1.1462450592885376e-05, "loss": 0.2786, "step": 893 }, { "epoch": 2.378162450066578, "grad_norm": 0.21152104541352987, "learning_rate": 1.141304347826087e-05, "loss": 0.2754, "step": 894 }, { "epoch": 2.3808255659121174, "grad_norm": 0.2436663825711812, "learning_rate": 1.1363636363636365e-05, "loss": 0.2646, "step": 895 }, { "epoch": 2.3834886817576564, "grad_norm": 0.253729858596224, "learning_rate": 1.1314229249011857e-05, "loss": 0.2815, "step": 896 }, { "epoch": 2.386151797603196, "grad_norm": 0.20642310572208497, "learning_rate": 1.1264822134387352e-05, "loss": 0.272, "step": 897 }, { "epoch": 2.388814913448735, "grad_norm": 0.22828401957220001, "learning_rate": 1.1215415019762846e-05, "loss": 0.2679, "step": 898 }, { "epoch": 2.3914780292942743, "grad_norm": 0.2226863403827293, "learning_rate": 1.116600790513834e-05, "loss": 0.2783, "step": 899 }, { "epoch": 2.3941411451398134, "grad_norm": 0.2380848377629423, "learning_rate": 1.1116600790513835e-05, "loss": 0.2688, "step": 900 }, { "epoch": 2.396804260985353, "grad_norm": 0.23278674245520006, "learning_rate": 1.106719367588933e-05, "loss": 0.271, "step": 901 }, { "epoch": 2.3994673768308923, "grad_norm": 0.20419629462602493, "learning_rate": 1.1017786561264824e-05, "loss": 0.265, "step": 902 }, { "epoch": 2.4021304926764313, "grad_norm": 0.2390569872958442, "learning_rate": 1.0968379446640318e-05, "loss": 0.2638, "step": 903 }, { "epoch": 2.4047936085219708, "grad_norm": 0.2279702813171203, "learning_rate": 1.091897233201581e-05, "loss": 0.2428, "step": 904 }, { "epoch": 2.40745672436751, "grad_norm": 0.21319204168497982, "learning_rate": 1.0869565217391305e-05, "loss": 0.2738, "step": 905 }, { "epoch": 2.4101198402130493, "grad_norm": 0.21016965126306628, "learning_rate": 1.08201581027668e-05, "loss": 0.2591, "step": 906 }, { "epoch": 2.4127829560585887, "grad_norm": 0.23241828917431315, "learning_rate": 1.0770750988142294e-05, "loss": 0.2691, "step": 907 }, { "epoch": 2.4154460719041277, "grad_norm": 0.2508034557509808, "learning_rate": 1.0721343873517788e-05, "loss": 0.2716, "step": 908 }, { "epoch": 2.418109187749667, "grad_norm": 0.24987214036836988, "learning_rate": 1.0671936758893281e-05, "loss": 0.2578, "step": 909 }, { "epoch": 2.4207723035952062, "grad_norm": 0.2380445170755529, "learning_rate": 1.0622529644268775e-05, "loss": 0.264, "step": 910 }, { "epoch": 2.4234354194407457, "grad_norm": 0.2201379804572699, "learning_rate": 1.057312252964427e-05, "loss": 0.274, "step": 911 }, { "epoch": 2.426098535286285, "grad_norm": 0.250942745509917, "learning_rate": 1.0523715415019764e-05, "loss": 0.2648, "step": 912 }, { "epoch": 2.428761651131824, "grad_norm": 0.22995097984900165, "learning_rate": 1.0474308300395258e-05, "loss": 0.2647, "step": 913 }, { "epoch": 2.4314247669773636, "grad_norm": 0.23698141688133578, "learning_rate": 1.0424901185770751e-05, "loss": 0.2737, "step": 914 }, { "epoch": 2.4340878828229027, "grad_norm": 0.21803776160842997, "learning_rate": 1.0375494071146246e-05, "loss": 0.272, "step": 915 }, { "epoch": 2.436750998668442, "grad_norm": 0.24131490172282968, "learning_rate": 1.032608695652174e-05, "loss": 0.2695, "step": 916 }, { "epoch": 2.4394141145139816, "grad_norm": 0.21919070590537304, "learning_rate": 1.0276679841897234e-05, "loss": 0.2642, "step": 917 }, { "epoch": 2.4420772303595206, "grad_norm": 0.22130430229063322, "learning_rate": 1.0227272727272729e-05, "loss": 0.2648, "step": 918 }, { "epoch": 2.44474034620506, "grad_norm": 0.2207950484316367, "learning_rate": 1.0177865612648221e-05, "loss": 0.2661, "step": 919 }, { "epoch": 2.447403462050599, "grad_norm": 0.21836484864507769, "learning_rate": 1.0128458498023716e-05, "loss": 0.2674, "step": 920 }, { "epoch": 2.4500665778961386, "grad_norm": 0.20744131254325618, "learning_rate": 1.007905138339921e-05, "loss": 0.2541, "step": 921 }, { "epoch": 2.4527296937416776, "grad_norm": 0.21453675745306103, "learning_rate": 1.0029644268774705e-05, "loss": 0.2739, "step": 922 }, { "epoch": 2.455392809587217, "grad_norm": 0.20834291358903456, "learning_rate": 9.980237154150199e-06, "loss": 0.2683, "step": 923 }, { "epoch": 2.458055925432756, "grad_norm": 0.2218801415090961, "learning_rate": 9.930830039525692e-06, "loss": 0.2725, "step": 924 }, { "epoch": 2.4607190412782955, "grad_norm": 0.22892525986093554, "learning_rate": 9.881422924901186e-06, "loss": 0.2736, "step": 925 }, { "epoch": 2.463382157123835, "grad_norm": 0.21019735025511882, "learning_rate": 9.83201581027668e-06, "loss": 0.2667, "step": 926 }, { "epoch": 2.466045272969374, "grad_norm": 0.22029826331712365, "learning_rate": 9.782608695652175e-06, "loss": 0.2685, "step": 927 }, { "epoch": 2.4687083888149135, "grad_norm": 0.2048436758988922, "learning_rate": 9.733201581027667e-06, "loss": 0.2675, "step": 928 }, { "epoch": 2.4713715046604525, "grad_norm": 0.22910504440789492, "learning_rate": 9.683794466403162e-06, "loss": 0.2769, "step": 929 }, { "epoch": 2.474034620505992, "grad_norm": 0.22852762946943356, "learning_rate": 9.634387351778656e-06, "loss": 0.2834, "step": 930 }, { "epoch": 2.4766977363515315, "grad_norm": 0.21897574663546826, "learning_rate": 9.58498023715415e-06, "loss": 0.2778, "step": 931 }, { "epoch": 2.4793608521970705, "grad_norm": 0.2050794319936511, "learning_rate": 9.535573122529645e-06, "loss": 0.2715, "step": 932 }, { "epoch": 2.48202396804261, "grad_norm": 0.21728652059101256, "learning_rate": 9.486166007905138e-06, "loss": 0.259, "step": 933 }, { "epoch": 2.484687083888149, "grad_norm": 0.22845416533089977, "learning_rate": 9.436758893280632e-06, "loss": 0.2761, "step": 934 }, { "epoch": 2.4873501997336884, "grad_norm": 0.21231590297088435, "learning_rate": 9.387351778656126e-06, "loss": 0.2677, "step": 935 }, { "epoch": 2.490013315579228, "grad_norm": 0.1926400508160791, "learning_rate": 9.33794466403162e-06, "loss": 0.2575, "step": 936 }, { "epoch": 2.492676431424767, "grad_norm": 0.22996010092008873, "learning_rate": 9.288537549407115e-06, "loss": 0.2548, "step": 937 }, { "epoch": 2.4953395472703064, "grad_norm": 0.22619760087939098, "learning_rate": 9.239130434782608e-06, "loss": 0.2676, "step": 938 }, { "epoch": 2.4980026631158454, "grad_norm": 0.20946128187824178, "learning_rate": 9.189723320158102e-06, "loss": 0.2649, "step": 939 }, { "epoch": 2.500665778961385, "grad_norm": 0.21291901939824368, "learning_rate": 9.140316205533597e-06, "loss": 0.2794, "step": 940 }, { "epoch": 2.5033288948069243, "grad_norm": 0.23983156472432737, "learning_rate": 9.090909090909091e-06, "loss": 0.2612, "step": 941 }, { "epoch": 2.5059920106524634, "grad_norm": 0.21371558486466197, "learning_rate": 9.041501976284585e-06, "loss": 0.2715, "step": 942 }, { "epoch": 2.508655126498003, "grad_norm": 0.20948609220977954, "learning_rate": 8.992094861660078e-06, "loss": 0.2685, "step": 943 }, { "epoch": 2.511318242343542, "grad_norm": 0.20326902436416877, "learning_rate": 8.942687747035572e-06, "loss": 0.2646, "step": 944 }, { "epoch": 2.5139813581890813, "grad_norm": 0.20716732265525145, "learning_rate": 8.893280632411067e-06, "loss": 0.2624, "step": 945 }, { "epoch": 2.5166444740346208, "grad_norm": 0.21310454845084212, "learning_rate": 8.843873517786561e-06, "loss": 0.2666, "step": 946 }, { "epoch": 2.51930758988016, "grad_norm": 0.2356341947109539, "learning_rate": 8.794466403162056e-06, "loss": 0.2607, "step": 947 }, { "epoch": 2.521970705725699, "grad_norm": 0.206705458805249, "learning_rate": 8.74505928853755e-06, "loss": 0.2765, "step": 948 }, { "epoch": 2.5246338215712383, "grad_norm": 0.19941570008688478, "learning_rate": 8.695652173913044e-06, "loss": 0.2774, "step": 949 }, { "epoch": 2.5272969374167777, "grad_norm": 0.22426207188439748, "learning_rate": 8.646245059288539e-06, "loss": 0.2829, "step": 950 }, { "epoch": 2.5299600532623168, "grad_norm": 0.25258528314600287, "learning_rate": 8.596837944664033e-06, "loss": 0.2646, "step": 951 }, { "epoch": 2.5326231691078562, "grad_norm": 0.2145489784213885, "learning_rate": 8.547430830039528e-06, "loss": 0.2607, "step": 952 }, { "epoch": 2.5352862849533953, "grad_norm": 0.19599385905462602, "learning_rate": 8.49802371541502e-06, "loss": 0.2543, "step": 953 }, { "epoch": 2.5379494007989347, "grad_norm": 0.2480014218006241, "learning_rate": 8.448616600790515e-06, "loss": 0.2689, "step": 954 }, { "epoch": 2.540612516644474, "grad_norm": 0.24788509439736134, "learning_rate": 8.399209486166009e-06, "loss": 0.2725, "step": 955 }, { "epoch": 2.543275632490013, "grad_norm": 0.2267111546180155, "learning_rate": 8.349802371541503e-06, "loss": 0.2635, "step": 956 }, { "epoch": 2.5459387483355527, "grad_norm": 0.21182851928367047, "learning_rate": 8.300395256916998e-06, "loss": 0.2638, "step": 957 }, { "epoch": 2.5486018641810917, "grad_norm": 0.21455676194315262, "learning_rate": 8.25098814229249e-06, "loss": 0.2585, "step": 958 }, { "epoch": 2.551264980026631, "grad_norm": 0.2169073571862216, "learning_rate": 8.201581027667985e-06, "loss": 0.2617, "step": 959 }, { "epoch": 2.5539280958721706, "grad_norm": 0.22625888751011447, "learning_rate": 8.15217391304348e-06, "loss": 0.271, "step": 960 }, { "epoch": 2.5565912117177096, "grad_norm": 0.20470193896466704, "learning_rate": 8.102766798418974e-06, "loss": 0.2662, "step": 961 }, { "epoch": 2.559254327563249, "grad_norm": 0.21322007235950363, "learning_rate": 8.053359683794468e-06, "loss": 0.2556, "step": 962 }, { "epoch": 2.561917443408788, "grad_norm": 0.20150617925679104, "learning_rate": 8.00395256916996e-06, "loss": 0.2582, "step": 963 }, { "epoch": 2.5645805592543276, "grad_norm": 0.2286944491087834, "learning_rate": 7.954545454545455e-06, "loss": 0.2661, "step": 964 }, { "epoch": 2.567243675099867, "grad_norm": 0.20708520844073464, "learning_rate": 7.90513833992095e-06, "loss": 0.2625, "step": 965 }, { "epoch": 2.569906790945406, "grad_norm": 0.1993453778786671, "learning_rate": 7.855731225296444e-06, "loss": 0.2684, "step": 966 }, { "epoch": 2.5725699067909455, "grad_norm": 0.19939625758599083, "learning_rate": 7.806324110671937e-06, "loss": 0.2658, "step": 967 }, { "epoch": 2.5752330226364846, "grad_norm": 0.20007029899978518, "learning_rate": 7.756916996047431e-06, "loss": 0.2612, "step": 968 }, { "epoch": 2.577896138482024, "grad_norm": 0.20768490453881108, "learning_rate": 7.707509881422925e-06, "loss": 0.2671, "step": 969 }, { "epoch": 2.5805592543275635, "grad_norm": 0.21354810130953325, "learning_rate": 7.65810276679842e-06, "loss": 0.2578, "step": 970 }, { "epoch": 2.5832223701731025, "grad_norm": 0.23174711166338519, "learning_rate": 7.608695652173914e-06, "loss": 0.2715, "step": 971 }, { "epoch": 2.5858854860186415, "grad_norm": 0.21079000224350897, "learning_rate": 7.559288537549407e-06, "loss": 0.2658, "step": 972 }, { "epoch": 2.588548601864181, "grad_norm": 0.2001035421079937, "learning_rate": 7.509881422924901e-06, "loss": 0.2569, "step": 973 }, { "epoch": 2.5912117177097205, "grad_norm": 0.2021065412071498, "learning_rate": 7.4604743083003955e-06, "loss": 0.2608, "step": 974 }, { "epoch": 2.5938748335552595, "grad_norm": 0.214158147452307, "learning_rate": 7.41106719367589e-06, "loss": 0.2779, "step": 975 }, { "epoch": 2.596537949400799, "grad_norm": 0.20790431049928293, "learning_rate": 7.361660079051384e-06, "loss": 0.2733, "step": 976 }, { "epoch": 2.599201065246338, "grad_norm": 0.20549750329181854, "learning_rate": 7.312252964426877e-06, "loss": 0.276, "step": 977 }, { "epoch": 2.6018641810918774, "grad_norm": 0.20237657523764993, "learning_rate": 7.262845849802371e-06, "loss": 0.2735, "step": 978 }, { "epoch": 2.604527296937417, "grad_norm": 0.20973877300015645, "learning_rate": 7.213438735177866e-06, "loss": 0.281, "step": 979 }, { "epoch": 2.607190412782956, "grad_norm": 0.22017905718680691, "learning_rate": 7.16403162055336e-06, "loss": 0.2677, "step": 980 }, { "epoch": 2.6098535286284954, "grad_norm": 0.2144342458050631, "learning_rate": 7.1146245059288545e-06, "loss": 0.2604, "step": 981 }, { "epoch": 2.6125166444740344, "grad_norm": 0.2050156532271564, "learning_rate": 7.065217391304347e-06, "loss": 0.2701, "step": 982 }, { "epoch": 2.615179760319574, "grad_norm": 0.1970203183942734, "learning_rate": 7.015810276679842e-06, "loss": 0.2505, "step": 983 }, { "epoch": 2.6178428761651134, "grad_norm": 0.20402269570746995, "learning_rate": 6.966403162055336e-06, "loss": 0.2599, "step": 984 }, { "epoch": 2.6205059920106524, "grad_norm": 0.20759868626386915, "learning_rate": 6.91699604743083e-06, "loss": 0.2733, "step": 985 }, { "epoch": 2.623169107856192, "grad_norm": 0.22693920517209076, "learning_rate": 6.867588932806325e-06, "loss": 0.2627, "step": 986 }, { "epoch": 2.625832223701731, "grad_norm": 0.20970122945185465, "learning_rate": 6.818181818181818e-06, "loss": 0.2704, "step": 987 }, { "epoch": 2.6284953395472703, "grad_norm": 0.20332704992870704, "learning_rate": 6.768774703557313e-06, "loss": 0.2762, "step": 988 }, { "epoch": 2.63115845539281, "grad_norm": 0.20966961639828544, "learning_rate": 6.719367588932807e-06, "loss": 0.2737, "step": 989 }, { "epoch": 2.633821571238349, "grad_norm": 0.2392085498215163, "learning_rate": 6.6699604743083014e-06, "loss": 0.2639, "step": 990 }, { "epoch": 2.6364846870838883, "grad_norm": 0.22069815282030755, "learning_rate": 6.620553359683796e-06, "loss": 0.2623, "step": 991 }, { "epoch": 2.6391478029294273, "grad_norm": 0.2062130093620195, "learning_rate": 6.5711462450592885e-06, "loss": 0.2634, "step": 992 }, { "epoch": 2.6418109187749668, "grad_norm": 0.21202212454473487, "learning_rate": 6.521739130434783e-06, "loss": 0.2732, "step": 993 }, { "epoch": 2.6444740346205062, "grad_norm": 0.20742438691074003, "learning_rate": 6.472332015810277e-06, "loss": 0.2775, "step": 994 }, { "epoch": 2.6471371504660453, "grad_norm": 0.20539419758832048, "learning_rate": 6.422924901185772e-06, "loss": 0.2786, "step": 995 }, { "epoch": 2.6498002663115847, "grad_norm": 0.19871961616535505, "learning_rate": 6.373517786561266e-06, "loss": 0.2642, "step": 996 }, { "epoch": 2.6524633821571237, "grad_norm": 0.2445459224085182, "learning_rate": 6.324110671936759e-06, "loss": 0.271, "step": 997 }, { "epoch": 2.655126498002663, "grad_norm": 0.20294635449003665, "learning_rate": 6.274703557312253e-06, "loss": 0.272, "step": 998 }, { "epoch": 2.6577896138482027, "grad_norm": 0.20711520929552674, "learning_rate": 6.2252964426877475e-06, "loss": 0.277, "step": 999 }, { "epoch": 2.6604527296937417, "grad_norm": 0.19858451035812705, "learning_rate": 6.175889328063241e-06, "loss": 0.2781, "step": 1000 }, { "epoch": 2.6631158455392807, "grad_norm": 0.2029933078164672, "learning_rate": 6.126482213438735e-06, "loss": 0.259, "step": 1001 }, { "epoch": 2.66577896138482, "grad_norm": 0.21745287030160018, "learning_rate": 6.07707509881423e-06, "loss": 0.27, "step": 1002 }, { "epoch": 2.6684420772303596, "grad_norm": 0.19345167090566057, "learning_rate": 6.027667984189723e-06, "loss": 0.268, "step": 1003 }, { "epoch": 2.6711051930758987, "grad_norm": 0.21568939666641776, "learning_rate": 5.978260869565218e-06, "loss": 0.2643, "step": 1004 }, { "epoch": 2.673768308921438, "grad_norm": 0.19296044607870885, "learning_rate": 5.928853754940711e-06, "loss": 0.2761, "step": 1005 }, { "epoch": 2.676431424766977, "grad_norm": 0.20181257150105722, "learning_rate": 5.879446640316206e-06, "loss": 0.271, "step": 1006 }, { "epoch": 2.6790945406125166, "grad_norm": 0.2073838164023787, "learning_rate": 5.8300395256917e-06, "loss": 0.2713, "step": 1007 }, { "epoch": 2.681757656458056, "grad_norm": 0.20965825745167907, "learning_rate": 5.7806324110671936e-06, "loss": 0.2689, "step": 1008 }, { "epoch": 2.684420772303595, "grad_norm": 0.20444583357709556, "learning_rate": 5.731225296442688e-06, "loss": 0.2831, "step": 1009 }, { "epoch": 2.6870838881491346, "grad_norm": 0.20971896583727812, "learning_rate": 5.681818181818182e-06, "loss": 0.2626, "step": 1010 }, { "epoch": 2.6897470039946736, "grad_norm": 0.2080555215910288, "learning_rate": 5.632411067193676e-06, "loss": 0.2602, "step": 1011 }, { "epoch": 2.692410119840213, "grad_norm": 0.2013420667078693, "learning_rate": 5.58300395256917e-06, "loss": 0.2653, "step": 1012 }, { "epoch": 2.6950732356857525, "grad_norm": 0.19614771328643982, "learning_rate": 5.533596837944665e-06, "loss": 0.2556, "step": 1013 }, { "epoch": 2.6977363515312915, "grad_norm": 0.20085761642467498, "learning_rate": 5.484189723320159e-06, "loss": 0.2744, "step": 1014 }, { "epoch": 2.700399467376831, "grad_norm": 0.21544774180757933, "learning_rate": 5.4347826086956525e-06, "loss": 0.2602, "step": 1015 }, { "epoch": 2.70306258322237, "grad_norm": 0.19696825099825307, "learning_rate": 5.385375494071147e-06, "loss": 0.2595, "step": 1016 }, { "epoch": 2.7057256990679095, "grad_norm": 0.1924176776922604, "learning_rate": 5.3359683794466405e-06, "loss": 0.2619, "step": 1017 }, { "epoch": 2.708388814913449, "grad_norm": 0.22132480166121332, "learning_rate": 5.286561264822135e-06, "loss": 0.2697, "step": 1018 }, { "epoch": 2.711051930758988, "grad_norm": 0.18691262036412767, "learning_rate": 5.237154150197629e-06, "loss": 0.2554, "step": 1019 }, { "epoch": 2.7137150466045274, "grad_norm": 0.1938229034237995, "learning_rate": 5.187747035573123e-06, "loss": 0.2586, "step": 1020 }, { "epoch": 2.7163781624500665, "grad_norm": 0.2129748283287826, "learning_rate": 5.138339920948617e-06, "loss": 0.2795, "step": 1021 }, { "epoch": 2.719041278295606, "grad_norm": 0.20445583537089335, "learning_rate": 5.088932806324111e-06, "loss": 0.2658, "step": 1022 }, { "epoch": 2.7217043941411454, "grad_norm": 0.1933528504807178, "learning_rate": 5.039525691699605e-06, "loss": 0.2621, "step": 1023 }, { "epoch": 2.7243675099866844, "grad_norm": 0.21949852883334098, "learning_rate": 4.9901185770750995e-06, "loss": 0.2649, "step": 1024 }, { "epoch": 2.7270306258322234, "grad_norm": 0.20152020359649447, "learning_rate": 4.940711462450593e-06, "loss": 0.265, "step": 1025 }, { "epoch": 2.729693741677763, "grad_norm": 0.20583564086259545, "learning_rate": 4.891304347826087e-06, "loss": 0.2619, "step": 1026 }, { "epoch": 2.7323568575233024, "grad_norm": 0.2007179587300372, "learning_rate": 4.841897233201581e-06, "loss": 0.2693, "step": 1027 }, { "epoch": 2.7350199733688414, "grad_norm": 0.1998685679119499, "learning_rate": 4.792490118577075e-06, "loss": 0.2629, "step": 1028 }, { "epoch": 2.737683089214381, "grad_norm": 0.21626697273734094, "learning_rate": 4.743083003952569e-06, "loss": 0.269, "step": 1029 }, { "epoch": 2.74034620505992, "grad_norm": 0.19448387232242922, "learning_rate": 4.693675889328063e-06, "loss": 0.2761, "step": 1030 }, { "epoch": 2.7430093209054593, "grad_norm": 0.19395208512967949, "learning_rate": 4.644268774703558e-06, "loss": 0.2653, "step": 1031 }, { "epoch": 2.745672436750999, "grad_norm": 0.18925291663752578, "learning_rate": 4.594861660079051e-06, "loss": 0.2568, "step": 1032 }, { "epoch": 2.748335552596538, "grad_norm": 0.20842012726728598, "learning_rate": 4.5454545454545455e-06, "loss": 0.2689, "step": 1033 }, { "epoch": 2.7509986684420773, "grad_norm": 0.20399895934870427, "learning_rate": 4.496047430830039e-06, "loss": 0.262, "step": 1034 }, { "epoch": 2.7536617842876163, "grad_norm": 0.21638718896911208, "learning_rate": 4.4466403162055334e-06, "loss": 0.2589, "step": 1035 }, { "epoch": 2.756324900133156, "grad_norm": 0.19757801710020018, "learning_rate": 4.397233201581028e-06, "loss": 0.2575, "step": 1036 }, { "epoch": 2.7589880159786953, "grad_norm": 0.1930523815662032, "learning_rate": 4.347826086956522e-06, "loss": 0.2589, "step": 1037 }, { "epoch": 2.7616511318242343, "grad_norm": 0.20093506678059855, "learning_rate": 4.298418972332017e-06, "loss": 0.2686, "step": 1038 }, { "epoch": 2.7643142476697737, "grad_norm": 0.20051627815913756, "learning_rate": 4.24901185770751e-06, "loss": 0.2709, "step": 1039 }, { "epoch": 2.7669773635153128, "grad_norm": 0.196594765327016, "learning_rate": 4.1996047430830045e-06, "loss": 0.2617, "step": 1040 }, { "epoch": 2.7696404793608522, "grad_norm": 0.19314366189878793, "learning_rate": 4.150197628458499e-06, "loss": 0.2851, "step": 1041 }, { "epoch": 2.7723035952063917, "grad_norm": 0.2161802526854043, "learning_rate": 4.1007905138339924e-06, "loss": 0.2674, "step": 1042 }, { "epoch": 2.7749667110519307, "grad_norm": 0.18272700852758644, "learning_rate": 4.051383399209487e-06, "loss": 0.2523, "step": 1043 }, { "epoch": 2.77762982689747, "grad_norm": 0.1914267001454524, "learning_rate": 4.00197628458498e-06, "loss": 0.271, "step": 1044 }, { "epoch": 2.780292942743009, "grad_norm": 0.20563053341844564, "learning_rate": 3.952569169960475e-06, "loss": 0.2588, "step": 1045 }, { "epoch": 2.7829560585885487, "grad_norm": 0.19474283827667518, "learning_rate": 3.903162055335968e-06, "loss": 0.259, "step": 1046 }, { "epoch": 2.785619174434088, "grad_norm": 0.199541546086498, "learning_rate": 3.853754940711463e-06, "loss": 0.2766, "step": 1047 }, { "epoch": 2.788282290279627, "grad_norm": 0.1962650749461456, "learning_rate": 3.804347826086957e-06, "loss": 0.275, "step": 1048 }, { "epoch": 2.790945406125166, "grad_norm": 0.19771877806493995, "learning_rate": 3.7549407114624506e-06, "loss": 0.2651, "step": 1049 }, { "epoch": 2.7936085219707056, "grad_norm": 0.25769379294942607, "learning_rate": 3.705533596837945e-06, "loss": 0.2792, "step": 1050 }, { "epoch": 2.796271637816245, "grad_norm": 0.2095398170946154, "learning_rate": 3.6561264822134385e-06, "loss": 0.2671, "step": 1051 }, { "epoch": 2.798934753661784, "grad_norm": 0.1929871299001819, "learning_rate": 3.606719367588933e-06, "loss": 0.2571, "step": 1052 }, { "epoch": 2.8015978695073236, "grad_norm": 0.19854196709504868, "learning_rate": 3.5573122529644273e-06, "loss": 0.2734, "step": 1053 }, { "epoch": 2.8042609853528626, "grad_norm": 0.20342959087962045, "learning_rate": 3.507905138339921e-06, "loss": 0.2675, "step": 1054 }, { "epoch": 2.806924101198402, "grad_norm": 0.19566813473730155, "learning_rate": 3.458498023715415e-06, "loss": 0.2636, "step": 1055 }, { "epoch": 2.8095872170439415, "grad_norm": 0.19394868609732532, "learning_rate": 3.409090909090909e-06, "loss": 0.2582, "step": 1056 }, { "epoch": 2.8122503328894806, "grad_norm": 0.19315741666740258, "learning_rate": 3.3596837944664035e-06, "loss": 0.2744, "step": 1057 }, { "epoch": 2.81491344873502, "grad_norm": 0.19500591092508857, "learning_rate": 3.310276679841898e-06, "loss": 0.2664, "step": 1058 }, { "epoch": 2.817576564580559, "grad_norm": 0.20369824754516933, "learning_rate": 3.2608695652173914e-06, "loss": 0.2753, "step": 1059 }, { "epoch": 2.8202396804260985, "grad_norm": 0.21679482311751339, "learning_rate": 3.211462450592886e-06, "loss": 0.2668, "step": 1060 }, { "epoch": 2.822902796271638, "grad_norm": 0.19207166020188257, "learning_rate": 3.1620553359683794e-06, "loss": 0.2714, "step": 1061 }, { "epoch": 2.825565912117177, "grad_norm": 0.18576307265975345, "learning_rate": 3.1126482213438737e-06, "loss": 0.2641, "step": 1062 }, { "epoch": 2.8282290279627165, "grad_norm": 0.19216814433561258, "learning_rate": 3.0632411067193677e-06, "loss": 0.2686, "step": 1063 }, { "epoch": 2.8308921438082555, "grad_norm": 0.20223820044568933, "learning_rate": 3.0138339920948617e-06, "loss": 0.2681, "step": 1064 }, { "epoch": 2.833555259653795, "grad_norm": 0.2025068882484355, "learning_rate": 2.9644268774703556e-06, "loss": 0.2671, "step": 1065 }, { "epoch": 2.8362183754993344, "grad_norm": 0.19192928047405172, "learning_rate": 2.91501976284585e-06, "loss": 0.2738, "step": 1066 }, { "epoch": 2.8388814913448734, "grad_norm": 0.18813387022576608, "learning_rate": 2.865612648221344e-06, "loss": 0.2555, "step": 1067 }, { "epoch": 2.841544607190413, "grad_norm": 0.17981642336035955, "learning_rate": 2.816205533596838e-06, "loss": 0.2649, "step": 1068 }, { "epoch": 2.844207723035952, "grad_norm": 0.19082585501925517, "learning_rate": 2.7667984189723323e-06, "loss": 0.2717, "step": 1069 }, { "epoch": 2.8468708388814914, "grad_norm": 0.1934715160744257, "learning_rate": 2.7173913043478263e-06, "loss": 0.2588, "step": 1070 }, { "epoch": 2.849533954727031, "grad_norm": 0.1943027368827162, "learning_rate": 2.6679841897233202e-06, "loss": 0.2612, "step": 1071 }, { "epoch": 2.85219707057257, "grad_norm": 0.20463059754180915, "learning_rate": 2.6185770750988146e-06, "loss": 0.2654, "step": 1072 }, { "epoch": 2.8548601864181093, "grad_norm": 0.21078399413940485, "learning_rate": 2.5691699604743086e-06, "loss": 0.2671, "step": 1073 }, { "epoch": 2.8575233022636484, "grad_norm": 0.20725181291345451, "learning_rate": 2.5197628458498025e-06, "loss": 0.2658, "step": 1074 }, { "epoch": 2.860186418109188, "grad_norm": 0.19210859826009163, "learning_rate": 2.4703557312252965e-06, "loss": 0.2749, "step": 1075 }, { "epoch": 2.8628495339547273, "grad_norm": 0.21087703729971102, "learning_rate": 2.4209486166007905e-06, "loss": 0.2565, "step": 1076 }, { "epoch": 2.8655126498002663, "grad_norm": 0.1932869202958659, "learning_rate": 2.3715415019762844e-06, "loss": 0.2761, "step": 1077 }, { "epoch": 2.8681757656458053, "grad_norm": 0.212098446975856, "learning_rate": 2.322134387351779e-06, "loss": 0.2739, "step": 1078 }, { "epoch": 2.870838881491345, "grad_norm": 0.1907847773078055, "learning_rate": 2.2727272727272728e-06, "loss": 0.2645, "step": 1079 }, { "epoch": 2.8735019973368843, "grad_norm": 0.2138904901003034, "learning_rate": 2.2233201581027667e-06, "loss": 0.266, "step": 1080 }, { "epoch": 2.8761651131824233, "grad_norm": 0.19201411133409543, "learning_rate": 2.173913043478261e-06, "loss": 0.2589, "step": 1081 }, { "epoch": 2.8788282290279628, "grad_norm": 0.1845739978063396, "learning_rate": 2.124505928853755e-06, "loss": 0.2597, "step": 1082 }, { "epoch": 2.881491344873502, "grad_norm": 0.19569151053283082, "learning_rate": 2.0750988142292494e-06, "loss": 0.2617, "step": 1083 }, { "epoch": 2.8841544607190412, "grad_norm": 0.19194512760322638, "learning_rate": 2.0256916996047434e-06, "loss": 0.2741, "step": 1084 }, { "epoch": 2.8868175765645807, "grad_norm": 0.19164700223613637, "learning_rate": 1.9762845849802374e-06, "loss": 0.2557, "step": 1085 }, { "epoch": 2.8894806924101197, "grad_norm": 0.20722349213232807, "learning_rate": 1.9268774703557313e-06, "loss": 0.2811, "step": 1086 }, { "epoch": 2.892143808255659, "grad_norm": 0.21395903599582983, "learning_rate": 1.8774703557312253e-06, "loss": 0.2697, "step": 1087 }, { "epoch": 2.894806924101198, "grad_norm": 0.19932722434475636, "learning_rate": 1.8280632411067192e-06, "loss": 0.2848, "step": 1088 }, { "epoch": 2.8974700399467377, "grad_norm": 0.19719366657115883, "learning_rate": 1.7786561264822136e-06, "loss": 0.2715, "step": 1089 }, { "epoch": 2.900133155792277, "grad_norm": 0.1975588211380889, "learning_rate": 1.7292490118577076e-06, "loss": 0.263, "step": 1090 }, { "epoch": 2.902796271637816, "grad_norm": 0.1939515446139924, "learning_rate": 1.6798418972332018e-06, "loss": 0.2576, "step": 1091 }, { "epoch": 2.9054593874833556, "grad_norm": 0.21461670844381095, "learning_rate": 1.6304347826086957e-06, "loss": 0.2622, "step": 1092 }, { "epoch": 2.9081225033288947, "grad_norm": 0.18141714157708164, "learning_rate": 1.5810276679841897e-06, "loss": 0.2602, "step": 1093 }, { "epoch": 2.910785619174434, "grad_norm": 0.18648909903146674, "learning_rate": 1.5316205533596839e-06, "loss": 0.2544, "step": 1094 }, { "epoch": 2.9134487350199736, "grad_norm": 0.19749530453878072, "learning_rate": 1.4822134387351778e-06, "loss": 0.2511, "step": 1095 }, { "epoch": 2.9161118508655126, "grad_norm": 0.2008025174676635, "learning_rate": 1.432806324110672e-06, "loss": 0.2621, "step": 1096 }, { "epoch": 2.918774966711052, "grad_norm": 0.1926237458483956, "learning_rate": 1.3833992094861662e-06, "loss": 0.2584, "step": 1097 }, { "epoch": 2.921438082556591, "grad_norm": 0.1917953810867646, "learning_rate": 1.3339920948616601e-06, "loss": 0.2696, "step": 1098 }, { "epoch": 2.9241011984021306, "grad_norm": 0.18863387793323863, "learning_rate": 1.2845849802371543e-06, "loss": 0.269, "step": 1099 }, { "epoch": 2.92676431424767, "grad_norm": 0.18859923936820897, "learning_rate": 1.2351778656126482e-06, "loss": 0.2629, "step": 1100 }, { "epoch": 2.929427430093209, "grad_norm": 0.18918722042687142, "learning_rate": 1.1857707509881422e-06, "loss": 0.2659, "step": 1101 }, { "epoch": 2.932090545938748, "grad_norm": 0.1909436486504395, "learning_rate": 1.1363636363636364e-06, "loss": 0.279, "step": 1102 }, { "epoch": 2.9347536617842875, "grad_norm": 0.215394252478964, "learning_rate": 1.0869565217391306e-06, "loss": 0.2771, "step": 1103 }, { "epoch": 2.937416777629827, "grad_norm": 0.1868050430391036, "learning_rate": 1.0375494071146247e-06, "loss": 0.255, "step": 1104 }, { "epoch": 2.940079893475366, "grad_norm": 0.18705337019297927, "learning_rate": 9.881422924901187e-07, "loss": 0.2472, "step": 1105 }, { "epoch": 2.9427430093209055, "grad_norm": 0.1935007995659731, "learning_rate": 9.387351778656126e-07, "loss": 0.2713, "step": 1106 }, { "epoch": 2.9454061251664445, "grad_norm": 0.18412759277611498, "learning_rate": 8.893280632411068e-07, "loss": 0.2653, "step": 1107 }, { "epoch": 2.948069241011984, "grad_norm": 0.18330377570006776, "learning_rate": 8.399209486166009e-07, "loss": 0.256, "step": 1108 }, { "epoch": 2.9507323568575234, "grad_norm": 0.19950543771973236, "learning_rate": 7.905138339920948e-07, "loss": 0.2732, "step": 1109 }, { "epoch": 2.9533954727030625, "grad_norm": 0.18701751210436693, "learning_rate": 7.411067193675889e-07, "loss": 0.2634, "step": 1110 }, { "epoch": 2.956058588548602, "grad_norm": 0.18889807484399168, "learning_rate": 6.916996047430831e-07, "loss": 0.2519, "step": 1111 }, { "epoch": 2.958721704394141, "grad_norm": 0.1898035633014786, "learning_rate": 6.422924901185771e-07, "loss": 0.2658, "step": 1112 }, { "epoch": 2.9613848202396804, "grad_norm": 0.1864905294817814, "learning_rate": 5.928853754940711e-07, "loss": 0.2562, "step": 1113 }, { "epoch": 2.96404793608522, "grad_norm": 0.18976880996630371, "learning_rate": 5.434782608695653e-07, "loss": 0.264, "step": 1114 }, { "epoch": 2.966711051930759, "grad_norm": 0.19331420232956223, "learning_rate": 4.940711462450593e-07, "loss": 0.273, "step": 1115 }, { "epoch": 2.9693741677762984, "grad_norm": 0.1930205378531215, "learning_rate": 4.446640316205534e-07, "loss": 0.2592, "step": 1116 }, { "epoch": 2.9720372836218374, "grad_norm": 0.19028897264532088, "learning_rate": 3.952569169960474e-07, "loss": 0.2654, "step": 1117 }, { "epoch": 2.974700399467377, "grad_norm": 0.19156481816748225, "learning_rate": 3.4584980237154154e-07, "loss": 0.261, "step": 1118 }, { "epoch": 2.9773635153129163, "grad_norm": 0.1889476580235995, "learning_rate": 2.9644268774703555e-07, "loss": 0.2566, "step": 1119 }, { "epoch": 2.9800266311584553, "grad_norm": 0.19663277621172817, "learning_rate": 2.4703557312252967e-07, "loss": 0.2751, "step": 1120 }, { "epoch": 2.982689747003995, "grad_norm": 0.1848208372611624, "learning_rate": 1.976284584980237e-07, "loss": 0.2633, "step": 1121 }, { "epoch": 2.985352862849534, "grad_norm": 0.18259691758877614, "learning_rate": 1.4822134387351778e-07, "loss": 0.2696, "step": 1122 }, { "epoch": 2.9880159786950733, "grad_norm": 0.1849664900149779, "learning_rate": 9.881422924901186e-08, "loss": 0.2704, "step": 1123 }, { "epoch": 2.9906790945406128, "grad_norm": 0.1854714711613864, "learning_rate": 4.940711462450593e-08, "loss": 0.2613, "step": 1124 }, { "epoch": 2.993342210386152, "grad_norm": 0.18380044771707796, "learning_rate": 0.0, "loss": 0.2614, "step": 1125 }, { "epoch": 2.993342210386152, "step": 1125, "total_flos": 9.575573608085586e+17, "train_loss": 0.4614936934842004, "train_runtime": 99022.1208, "train_samples_per_second": 0.182, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 1125, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.575573608085586e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }